diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
index b561659..0927123 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
@@ -28,49 +28,76 @@
 #include "arm_gemm.hpp"
 
 #include "gemm_common.hpp"
+#include "gemm_hybrid.hpp"
 #include "gemm_implementation.hpp"
 #include "gemm_interleaved.hpp"
+#include "gemm_native.hpp"
 
+#include "kernels/a32_sgemm_8x6.hpp"
 #include "kernels/a64_hgemm_24x8.hpp"
 #include "kernels/a64_sgemm_12x8.hpp"
-#include "kernels/a32_sgemm_8x6.hpp"
+#include "kernels/sve_hybrid_fp16_mla_4VLx4.hpp"
 #include "kernels/sve_interleaved_fp16_mla_3VLx8.hpp"
+#include "kernels/sve_native_fp16_mla_4VLx4.hpp"
 
 namespace arm_gemm {
 
 static const GemmImplementation<__fp16, __fp16> gemm_fp16_methods[] = {
 #if defined(__ARM_FEATURE_SVE)
 {
+    GemmMethod::GEMM_HYBRID,
+    "hybrid_fp16_mla_4VLx4",
+    [](const GemmArgs<__fp16> &args) { return (args._Ksize >= 8) && (args._alpha == 1.0f) && !args._trA && args._pretransposed_hint; },
+    [](const GemmArgs<__fp16> &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+    [](const GemmArgs<__fp16> &args) { return new GemmHybrid<hybrid_fp16_mla_4VLx4, __fp16, __fp16>(args); }
+},
+{
+    GemmMethod::GEMM_NATIVE,
+    "native_fp16_mla_4VLx4",
+    [](const GemmArgs<__fp16> &args) { return (args._Ksize >= 8 && args._alpha==1.0f && !args._trA && !args._trB); },
+    [](const GemmArgs<__fp16> &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+    [](const GemmArgs<__fp16> &args) { return new GemmNative<native_fp16_mla_4VLx4, __fp16, __fp16>(args); }
+},
+{
     GemmMethod::GEMM_INTERLEAVED,
     "interleaved_fp16_mla_3VLx8",
     [](const GemmArgs<__fp16> &args) { return (args._Ksize > 4); },
-    [](const GemmArgs<__fp16> &args) { return true; },
+    nullptr,
     [](const GemmArgs<__fp16> &args) { return new GemmInterleaved<interleaved_fp16_mla_3VLx8, __fp16, __fp16>(args); }
 },
 #endif
+
 #if defined(__aarch64__) && (defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || defined(FP16_KERNELS))
 {
     GemmMethod::GEMM_INTERLEAVED,
     "hgemm_24x8",
-    [](const GemmArgs<__fp16> &args) {
 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        return args._ci->has_fp16();
+    [](const GemmArgs<__fp16> &args) { return args._ci->has_fp16(); },
 #else
-        return true;
+    nullptr,
 #endif
-    },
-    [](const GemmArgs<__fp16> &args) { return true; },
+    nullptr,
     [](const GemmArgs<__fp16> &args) { return new GemmInterleaved<hgemm_24x8, __fp16, __fp16>(args); }
 },
 #endif
-#if defined(__arm__)
+#ifdef __aarch64__
+{
+    GemmMethod::GEMM_INTERLEAVED,
+    "sgemm_12x8",
+    nullptr,
+    nullptr,
+    [](const GemmArgs<__fp16> &args) { return new GemmInterleaved<sgemm_12x8, __fp16, __fp16>(args); }
+},
+#elif defined(__arm__)
 {
     GemmMethod::GEMM_INTERLEAVED,
     "sgemm_8x6",
-    [](const GemmArgs<__fp16> &args) { return true; },
-    [](const GemmArgs<__fp16> &args) { return true; },
+    nullptr,
+    nullptr,
     [](const GemmArgs<__fp16> &args) { return new GemmInterleaved<sgemm_8x6, __fp16, __fp16>(args); }
 },
+#else // not AArch64 or AArch32
+# error Unknown Architecture
 #endif
 {
     GemmMethod::DEFAULT,
@@ -90,8 +117,8 @@
 template UniqueGemmCommon<__fp16, __fp16> gemm<__fp16, __fp16>(const GemmArgs<__fp16> &args);
 template KernelDescription get_gemm_method<__fp16, __fp16>(const GemmArgs<__fp16> &args);
 template bool method_is_compatible<__fp16, __fp16>(GemmMethod method, const GemmArgs<__fp16> &args);
-template std::vector<std::string> get_compatible_kernels<__fp16, __fp16> (const GemmArgs<__fp16> &args);
+template std::vector<KernelDescription> get_compatible_kernels<__fp16, __fp16> (const GemmArgs<__fp16> &args);
 
 } // namespace arm_gemm
 
-#endif // __ARM_FP16_ARGS
\ No newline at end of file
+#endif // __ARM_FP16_ARGS
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
index 8bc33cc..6869279 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
@@ -32,6 +32,7 @@
 #include "gemv_pretransposed.hpp"
 
 #include "kernels/a32_sgemm_8x6.hpp"
+#include "kernels/a64_hybrid_fp32_mla_16x4.hpp"
 #include "kernels/a64_sgemm_12x8.hpp"
 #include "kernels/a64_sgemm_native_16x4.hpp"
 #include "kernels/a64_sgemm_nativeA_pretransposeB_16x4.hpp"
@@ -112,6 +113,13 @@
     [](const GemmArgs<float> &args) { return new GemmHybrid<sgemm_nativeA_pretransposeB_16x4, float, float>(args); }
 },
 {
+    GemmMethod::GEMM_HYBRID,
+    "hybrid_fp32_mla_16x4",
+    [](const GemmArgs<float> &args) { return (args._Ksize >= 4) && (args._alpha == 1.0f) && !args._trA && args._pretransposed_hint; },
+    [](const GemmArgs<float> &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+    [](const GemmArgs<float> &args) { return new GemmHybrid<hybrid_fp32_mla_16x4, float, float>(args); }
+},
+{
     GemmMethod::GEMM_NATIVE,
     "sgemm_native_16x4",
     [](const GemmArgs<float> &args) { return (args._Ksize>4 && (args._Nsize % 16)==0 && args._alpha==1.0f && !args._trA && !args._trB); },
@@ -165,6 +173,6 @@
 template UniqueGemmCommon<float, float> gemm<float, float>(const GemmArgs<float> &args);
 template KernelDescription get_gemm_method<float, float>(const GemmArgs<float> &args);
 template bool method_is_compatible<float, float>(GemmMethod method, const GemmArgs<float> &args);
-template std::vector<std::string> get_compatible_kernels<float, float> (const GemmArgs<float> &args);
+template std::vector<KernelDescription> get_compatible_kernels<float, float> (const GemmArgs<float> &args);
 
-} // namespace arm_gemm
\ No newline at end of file
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
index c2bd0bb..82e0625 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
@@ -208,7 +208,6 @@
         return roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll()) * _nmulti * sizeof(Toi);
     }
 
-    using GemmCommon<To, Tr>::pretranspose_B_array;
     void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
         Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
         _B_transposed = buffer;
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
index bf80784..d952140 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
@@ -112,8 +112,12 @@
 }
 
 template<typename Top, typename Tret>
-std::vector<std::string> get_compatible_kernels(const GemmArgs<Tret> &args) {
-    std::vector<std::string> res;
+std::vector<KernelDescription> get_compatible_kernels(const GemmArgs<Tret> &args) {
+    std::vector<KernelDescription> res;
+
+    /* Find out what the default implementation is so we can set the flag accordingly later. */
+    const GemmImplementation<Top, Tret> *default_impl;
+    find_implementation(args, default_impl);
 
     auto gemms = gemm_implementation_list<Top, Tret>();
 
@@ -123,7 +127,7 @@
             continue;
         }
 
-        res.push_back(i->name);
+        res.push_back(KernelDescription(i->method, i->name, i==default_impl));
     }
 
     return res;
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
index b4503dd..0db0654 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
@@ -58,7 +58,7 @@
 template UniqueGemmCommon<int16_t, int32_t> gemm<int16_t, int32_t>(const GemmArgs<int32_t> &args);
 template KernelDescription get_gemm_method<int16_t, int32_t>(const GemmArgs<int32_t> &args);
 template bool method_is_compatible<int16_t, int32_t>(GemmMethod method, const GemmArgs<int32_t> &args);
-template std::vector<std::string> get_compatible_kernels<int16_t, int32_t> (const GemmArgs<int32_t> &args);
+template std::vector<KernelDescription> get_compatible_kernels<int16_t, int32_t> (const GemmArgs<int32_t> &args);
 
 } // namespace arm_gemm
 
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
index 5811c2a..9e49df1 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,6 +34,7 @@
 #include "kernels/a64_gemm_s8_12x8.hpp"
 #include "kernels/a64_gemm_s8_4x4.hpp"
 #include "kernels/a64_hybrid_s8s32_dot_16x4.hpp"
+#include "kernels/sve_hybrid_s8s32_dot_4VLx4.hpp"
 #include "kernels/sve_interleaved_s8s32_dot_3VLx8.hpp"
 #include "kernels/sve_native_s8s32_dot_4VLx4.hpp"
 
@@ -42,6 +43,13 @@
 static const GemmImplementation<int8_t, int32_t> gemm_s8_methods[] = {
 #ifdef __ARM_FEATURE_SVE
 {
+    GemmMethod::GEMM_HYBRID,
+    "hybrid_s8s32_dot_4VLx4",
+    [](const GemmArgs<int32_t> &args) { return args._Ksize>=16 && args._alpha==1 && !args._trA && !args._trB && args._pretransposed_hint; },
+    [](const GemmArgs<int32_t> &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+    [](const GemmArgs<int32_t> &args) { return new GemmHybrid<hybrid_s8s32_dot_4VLx4, int8_t, int32_t>(args); }
+},
+{
     GemmMethod::GEMM_NATIVE,
     "native_s8s32_dot_4VLx4",
     [](const GemmArgs<int32_t> &args) { return (args._Ksize>=16 && args._alpha==1 && !args._trA && !args._trB); },
@@ -59,7 +67,7 @@
 {
     GemmMethod::GEMM_HYBRID,
     "hybrid_s8s32_dot_16x4",
-    [](const GemmArgs<int32_t> &args) { return args._ci->has_dotprod() && args._Ksize>=16 && (args._Ksize % 16 == 0) && (args._Nsize % 16 == 0) && !args._trA && !args._trB && args._pretransposed_hint; },
+    [](const GemmArgs<int32_t> &args) { return args._ci->has_dotprod() && args._Ksize>=16 && !args._trA && !args._trB && args._pretransposed_hint; },
     [](const GemmArgs<int32_t> &args) { return args._Nsize<=256 && args._Ksize>128; },
     [](const GemmArgs<int32_t> &args) { return new GemmHybrid<hybrid_s8s32_dot_16x4, int8_t, int32_t>(args); }
 },
@@ -95,7 +103,7 @@
 template UniqueGemmCommon<int8_t, int32_t> gemm<int8_t, int32_t>(const GemmArgs<int32_t> &args);
 template KernelDescription get_gemm_method<int8_t, int32_t>(const GemmArgs<int32_t> &args);
 template bool method_is_compatible<int8_t, int32_t>(GemmMethod method, const GemmArgs<int32_t> &args);
-template std::vector<std::string> get_compatible_kernels<int8_t, int32_t> (const GemmArgs<int32_t> &args);
+template std::vector<KernelDescription> get_compatible_kernels<int8_t, int32_t> (const GemmArgs<int32_t> &args);
 
 } // namespace arm_gemm
 
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index b83ccd3..a773166 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -480,7 +480,6 @@
         return total;
     }
 
-    using GemmCommon<To, Tr>::pretranspose_B_array;
     void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
         blockwalker current(*this);
         Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
index 6bcbca9..9e3e4e4 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
@@ -58,7 +58,7 @@
 template UniqueGemmCommon<uint16_t, uint32_t> gemm<uint16_t, uint32_t>(const GemmArgs<uint32_t> &args);
 template KernelDescription get_gemm_method<uint16_t, uint32_t>(const GemmArgs<uint32_t> &args);
 template bool method_is_compatible<uint16_t, uint32_t>(GemmMethod method, const GemmArgs<uint32_t> &args);
-template std::vector<std::string> get_compatible_kernels<uint16_t, uint32_t> (const GemmArgs<uint32_t> &args);
+template std::vector<KernelDescription> get_compatible_kernels<uint16_t, uint32_t> (const GemmArgs<uint32_t> &args);
 
 } // namespace arm_gemm
 
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
index b95ca80..9321bfc 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,6 +34,7 @@
 #include "kernels/a64_gemm_u8_12x8.hpp"
 #include "kernels/a64_gemm_u8_4x4.hpp"
 #include "kernels/a64_hybrid_u8u32_dot_16x4.hpp"
+#include "kernels/sve_hybrid_u8u32_dot_4VLx4.hpp"
 #include "kernels/sve_interleaved_u8u32_dot_3VLx8.hpp"
 #include "kernels/sve_native_u8u32_dot_4VLx4.hpp"
 
@@ -42,6 +43,13 @@
 static const GemmImplementation<uint8_t, uint32_t> gemm_u8_methods[] = {
 #ifdef __ARM_FEATURE_SVE
 {
+    GemmMethod::GEMM_HYBRID,
+    "hybrid_u8u32_dot_4VLx4",
+    [](const GemmArgs<uint32_t> &args) { return args._Ksize>=16 && args._alpha==1 && !args._trA && !args._trB && args._pretransposed_hint; },
+    [](const GemmArgs<uint32_t> &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+    [](const GemmArgs<uint32_t> &args) { return new GemmHybrid<hybrid_u8u32_dot_4VLx4, uint8_t, uint32_t>(args); }
+},
+{
     GemmMethod::GEMM_NATIVE,
     "native_u8u32_dot_4VLx4",
     [](const GemmArgs<uint32_t> &args) { return (args._Ksize>=16 && args._alpha==1 && !args._trA && !args._trB); },
@@ -59,7 +67,7 @@
 {
     GemmMethod::GEMM_HYBRID,
     "hybrid_u8u32_dot_16x4",
-    [](const GemmArgs<uint32_t> &args) { return args._ci->has_dotprod() && args._Ksize>=16 && (args._Ksize % 16 == 0) && (args._Nsize % 16 == 0) && !args._trA && !args._trB && args._pretransposed_hint; },
+    [](const GemmArgs<uint32_t> &args) { return args._ci->has_dotprod() && args._Ksize>=16 && !args._trA && !args._trB && args._pretransposed_hint; },
     [](const GemmArgs<uint32_t> &args) { return args._Nsize<=256 && args._Ksize>128; },
     [](const GemmArgs<uint32_t> &args) { return new GemmHybrid<hybrid_u8u32_dot_16x4, uint8_t, uint32_t>(args); }
 },
@@ -95,7 +103,7 @@
 template UniqueGemmCommon<uint8_t, uint32_t> gemm<uint8_t, uint32_t>(const GemmArgs<uint32_t> &args);
 template KernelDescription get_gemm_method<uint8_t, uint32_t>(const GemmArgs<uint32_t> &args);
 template bool method_is_compatible<uint8_t, uint32_t>(GemmMethod method, const GemmArgs<uint32_t> &args);
-template std::vector<std::string> get_compatible_kernels<uint8_t, uint32_t> (const GemmArgs<uint32_t> &args);
+template std::vector<KernelDescription> get_compatible_kernels<uint8_t, uint32_t> (const GemmArgs<uint32_t> &args);
 
 } // namespace arm_gemm
 
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
index 32d668f..b7f9de8 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
@@ -44,10 +44,9 @@
         _subgemm = gemm<To,Tr>(newargs);
     }
 
-    using GemmCommon<To, Tr>::set_arrays;
     void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride,
                     const To *B, const int ldb, const int B_multi_stride,
-                          Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride) override {
+                    Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride) override {
         /* A and C's batch stride becomes their new row stride.  New batch stride is 0 as nbatches for subgemm is always 1. */
         _subgemm->set_arrays(A, A_batch_stride, 0, A_multi_stride,
                              B, ldb, B_multi_stride,
@@ -86,7 +85,6 @@
         return _subgemm->get_B_pretransposed_array_size();
     }
 
-    using GemmCommon<To, Tr>::pretranspose_B_array;
     void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override {
         _subgemm->pretranspose_B_array(buffer, B, ldb, B_multi_stride);
     }
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
index f7beb0a..21f8278 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
@@ -148,7 +148,6 @@
         return _buffer_per_multi * _nmultis * sizeof(To);
     }
 
-    using GemmCommon<To, Tr>::pretranspose_B_array;
     void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override {
         Toi *A_buffer = reinterpret_cast<Toi *>(buffer);
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp
new file mode 100644
index 0000000..5605939
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+
+#include "../std_transforms_fixed.hpp"
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations
+void a64_hybrid_fp32_mla_16x4(const float *, int, const float *, float *, int, float, int, int, int);
+void a64_hybrid_fp32_mla_16x4_a55(const float *, int, const float *, float *, int, float, int, int, int);
+// Kernel description class used as the template argument of GemmHybrid for FP32: bundles blocking parameters, a StdTransformsFixed member and the kernel function pointer.
+class hybrid_fp32_mla_16x4
+{
+public:
+    typedef float operand_type;
+    typedef float result_type;
+    // Function-pointer type with the same parameter list as the two kernel entry points declared before this class.
+    typedef void (*kern_type)(const float *, int, const float *, float *, int, float, int, int, int);
+
+    /* Kernel blocking parameters: out_height() is 4, out_width() is 16 and k_unroll() is 1, the same values passed to StdTransformsFixed below. */
+    static unsigned int out_height()
+    {
+        return 4;
+    }
+
+    static unsigned int out_width()
+    {
+        return 16;
+    }
+
+    static unsigned int k_unroll()
+    {
+        return 1;
+    }
+
+    StdTransformsFixed<operand_type, result_type, 4, 16, 1> transforms = {};
+
+    // Default to the generic kernel; the constructor replaces it with the _a55 variant when the CPU model is A55r1.
+    kern_type kernel=a64_hybrid_fp32_mla_16x4;
+    // Picks the kernel variant from the CPU model reported by ci; ci is dereferenced without a null check.
+    hybrid_fp32_mla_16x4(const CPUInfo *ci)
+    {
+        if (ci->get_cpu_model() == CPUModel::A55r1) {
+            kernel = a64_hybrid_fp32_mla_16x4_a55;
+        }
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp
new file mode 100644
index 0000000..7261761
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp
@@ -0,0 +1,2352 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <algorithm>
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void a64_hybrid_fp32_mla_16x4_a55(const float *A, int lda, const float *B, float *C, int ldc, float beta, int M, int N, int K) {
+    const long beta0 = (beta == 0.0f);
+    const int K_stride = K;
+    const long loops_count = ((K + 4) / 8) - 1;
+    K -= loops_count * 8;
+    const long regs_count = (K / 4) - 1;
+    K -= (regs_count + 1) * 4;
+    const long blocks_count = K / 1;
+
+    for (int y=0; y<M; y+=4) {
+        const float * const a_ptr0_base = A + (y * lda);
+        const unsigned long ldab = lda * sizeof(float);
+
+        float *c_ptr0 = C + (y * ldc);
+
+        for (int x0=0; x0<N; x0+=16ul) {
+            const long width = std::min((unsigned long)N-x0, 16ul);
+            const float *betaptr = &beta;
+            long loops = loops_count;
+            long regs = regs_count;
+            long blocks = blocks_count;
+            const float *a_ptr0 = a_ptr0_base;
+            const float *b_ptr0 = B + (K_stride * x0);
+            const bool use_result_buffer = (width < 16);
+            float result_buffer[64];
+            const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(float);
+            float *c_ptr_real = c_ptr0;
+            if (use_result_buffer && !beta0) {
+                for(int cy=0; cy<std::min(M-y, 4); cy++) {
+                    for(unsigned int cx=0; cx<width; cx++) {
+                        result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
+                    }
+                }
+            }
+            if (use_result_buffer) {
+                c_ptr0 = result_buffer;
+            }
+
+            switch(M-y) {
+                case 1:
+                    __asm __volatile (
+                        "temploadreg0 .req X0\n"
+                        "temploadreg1 .req X1\n"
+                        "temploadreg2 .req X2\n"
+                        "temploadreg3 .req X3\n"
+                        "cbz %[beta0], 1f\n"
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "fmul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "fmul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "ldr d0, [%[a_ptr0], #-0x10]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "ins v0.d[1], temploadreg0\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "5:\n"
+                        "cbz %[blocks], 6f\n"
+                        "7:\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr s0, [%[a_ptr0]]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "b.ne 7b\n"
+                        "6:\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        ".unreq temploadreg0\n"
+                        ".unreq temploadreg1\n"
+                        ".unreq temploadreg2\n"
+                        ".unreq temploadreg3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+                    );
+                    break;
+                case 2:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "c_ptr1 .req X1\n"
+                        "temploadreg0 .req X2\n"
+                        "temploadreg1 .req X3\n"
+                        "temploadreg2 .req X4\n"
+                        "temploadreg3 .req X5\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "cbz %[beta0], 1f\n"
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v20.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "movi v21.4s, #0\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "movi v22.4s, #0\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "movi v23.4s, #0\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "fmul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q20, [c_ptr1]\n"
+                        "fmul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q21, [c_ptr1, #0x10]\n"
+                        "fmul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q22, [c_ptr1, #0x20]\n"
+                        "fmul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q23, [c_ptr1, #0x30]\n"
+                        "fmul v20.4s, v20.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "fmul v21.4s, v21.4s, v15.4s\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "fmul v22.4s, v22.4s, v15.4s\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmul v23.4s, v23.4s, v15.4s\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "ldr d5, [a_ptr1]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "ins v5.d[1], temploadreg1\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "fmla v20.4s, v8.4s, v5.s[0]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "fmla v21.4s, v9.4s, v5.s[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "fmla v22.4s, v10.4s, v5.s[0]\n"
+                        "ldr d0, [%[a_ptr0], #-0x10]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                        "fmla v23.4s, v11.4s, v5.s[0]\n"
+                        "ldr d1, [a_ptr1, #-0x10]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+                        "fmla v20.4s, v12.4s, v5.s[1]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "ins v0.d[1], temploadreg0\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v21.4s, v13.4s, v5.s[1]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "ins v1.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "fmla v22.4s, v14.4s, v5.s[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "fmla v23.4s, v15.4s, v5.s[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "fmla v20.4s, v8.4s, v5.s[2]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "fmla v21.4s, v9.4s, v5.s[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "fmla v22.4s, v10.4s, v5.s[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v23.4s, v11.4s, v5.s[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v20.4s, v12.4s, v5.s[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v21.4s, v13.4s, v5.s[3]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        "fmla v22.4s, v14.4s, v5.s[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "fmla v23.4s, v15.4s, v5.s[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [c_ptr1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr d5, [a_ptr1]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "ins v5.d[1], temploadreg1\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "fmla v20.4s, v8.4s, v5.s[0]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "fmla v21.4s, v9.4s, v5.s[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "fmla v22.4s, v10.4s, v5.s[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v23.4s, v11.4s, v5.s[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v20.4s, v12.4s, v5.s[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "fmla v21.4s, v13.4s, v5.s[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "fmla v22.4s, v14.4s, v5.s[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "fmla v23.4s, v15.4s, v5.s[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "fmla v20.4s, v8.4s, v5.s[2]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "fmla v21.4s, v9.4s, v5.s[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "fmla v22.4s, v10.4s, v5.s[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v23.4s, v11.4s, v5.s[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "fmla v20.4s, v12.4s, v5.s[3]\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "fmla v21.4s, v13.4s, v5.s[3]\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "fmla v22.4s, v14.4s, v5.s[3]\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "fmla v23.4s, v15.4s, v5.s[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "5:\n"
+                        "cbz %[blocks], 6f\n"
+                        "7:\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr s0, [%[a_ptr0]]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr s1, [a_ptr1]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x4\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "b.ne 7b\n"
+                        "6:\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        "str q20, [c_ptr1]\n"
+                        "str q21, [c_ptr1, #0x10]\n"
+                        "str q22, [c_ptr1, #0x20]\n"
+                        "str q23, [c_ptr1, #0x30]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq temploadreg0\n"
+                        ".unreq temploadreg1\n"
+                        ".unreq temploadreg2\n"
+                        ".unreq temploadreg3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+                    );
+                    break;
+                case 3:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "c_ptr1 .req X2\n"
+                        "c_ptr2 .req X3\n"
+                        "temploadreg0 .req X4\n"
+                        "temploadreg1 .req X5\n"
+                        "temploadreg2 .req X6\n"
+                        "temploadreg3 .req X7\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "cbz %[beta0], 1f\n"
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v20.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v21.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "movi v22.4s, #0\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "movi v23.4s, #0\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "movi v24.4s, #0\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "movi v25.4s, #0\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "movi v26.4s, #0\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "movi v27.4s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "fmul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q20, [c_ptr1]\n"
+                        "fmul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q21, [c_ptr1, #0x10]\n"
+                        "fmul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q22, [c_ptr1, #0x20]\n"
+                        "fmul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q23, [c_ptr1, #0x30]\n"
+                        "fmul v20.4s, v20.4s, v15.4s\n"
+                        "ldr q24, [c_ptr2]\n"
+                        "fmul v21.4s, v21.4s, v15.4s\n"
+                        "ldr q25, [c_ptr2, #0x10]\n"
+                        "fmul v22.4s, v22.4s, v15.4s\n"
+                        "ldr q26, [c_ptr2, #0x20]\n"
+                        "fmul v23.4s, v23.4s, v15.4s\n"
+                        "ldr q27, [c_ptr2, #0x30]\n"
+                        "fmul v24.4s, v24.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "fmul v25.4s, v25.4s, v15.4s\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "fmul v26.4s, v26.4s, v15.4s\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "fmul v27.4s, v27.4s, v15.4s\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr d5, [a_ptr1]\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr d6, [a_ptr2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "ins v5.d[1], temploadreg1\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v24.4s, v12.4s, v2.s[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "ins v6.d[1], temploadreg2\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "fmla v25.4s, v13.4s, v2.s[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "fmla v26.4s, v14.4s, v2.s[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "fmla v27.4s, v15.4s, v2.s[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "fmla v24.4s, v8.4s, v2.s[2]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "fmla v25.4s, v9.4s, v2.s[2]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v26.4s, v10.4s, v2.s[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v27.4s, v11.4s, v2.s[2]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v24.4s, v12.4s, v2.s[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        "fmla v25.4s, v13.4s, v2.s[3]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v26.4s, v14.4s, v2.s[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v27.4s, v15.4s, v2.s[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v20.4s, v8.4s, v5.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "fmla v24.4s, v8.4s, v6.s[0]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "fmla v21.4s, v9.4s, v5.s[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v25.4s, v9.4s, v6.s[0]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "fmla v22.4s, v10.4s, v5.s[0]\n"
+                        "ldr d0, [%[a_ptr0], #-0x10]\n"
+                        "fmla v26.4s, v10.4s, v6.s[0]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                        "fmla v23.4s, v11.4s, v5.s[0]\n"
+                        "ldr d1, [a_ptr1, #-0x10]\n"
+                        "fmla v27.4s, v11.4s, v6.s[0]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+                        "fmla v20.4s, v12.4s, v5.s[1]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v24.4s, v12.4s, v6.s[1]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "ins v0.d[1], temploadreg0\n"
+                        "fmla v21.4s, v13.4s, v5.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v25.4s, v13.4s, v6.s[1]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "ins v1.d[1], temploadreg1\n"
+                        "fmla v22.4s, v14.4s, v5.s[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v26.4s, v14.4s, v6.s[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                        "ldr d2, [a_ptr2, #-0x10]\n"
+                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+                        "fmla v23.4s, v15.4s, v5.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "fmla v27.4s, v15.4s, v6.s[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "ins v2.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "fmla v20.4s, v8.4s, v5.s[2]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v24.4s, v8.4s, v6.s[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "fmla v21.4s, v9.4s, v5.s[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "fmla v25.4s, v9.4s, v6.s[2]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "fmla v22.4s, v10.4s, v5.s[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v26.4s, v10.4s, v6.s[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v23.4s, v11.4s, v5.s[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v27.4s, v11.4s, v6.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v20.4s, v12.4s, v5.s[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        "fmla v24.4s, v12.4s, v6.s[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        "fmla v21.4s, v13.4s, v5.s[3]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v25.4s, v13.4s, v6.s[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v22.4s, v14.4s, v5.s[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        "fmla v26.4s, v14.4s, v6.s[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v23.4s, v15.4s, v5.s[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "fmla v27.4s, v15.4s, v6.s[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "prfm PSTL1KEEP, [c_ptr1]\n"
+                        "prfm PSTL1KEEP, [c_ptr2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "ldr d5, [a_ptr1]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr d6, [a_ptr2]\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "ins v5.d[1], temploadreg1\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "ins v6.d[1], temploadreg2\n"
+                        "fmla v24.4s, v12.4s, v2.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "fmla v25.4s, v13.4s, v2.s[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "fmla v26.4s, v14.4s, v2.s[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "fmla v27.4s, v15.4s, v2.s[1]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "fmla v24.4s, v8.4s, v2.s[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "fmla v25.4s, v9.4s, v2.s[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v26.4s, v10.4s, v2.s[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v27.4s, v11.4s, v2.s[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        "fmla v24.4s, v12.4s, v2.s[3]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v25.4s, v13.4s, v2.s[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v26.4s, v14.4s, v2.s[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v27.4s, v15.4s, v2.s[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "fmla v20.4s, v8.4s, v5.s[0]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v24.4s, v8.4s, v6.s[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v21.4s, v9.4s, v5.s[0]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v25.4s, v9.4s, v6.s[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v22.4s, v10.4s, v5.s[0]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v26.4s, v10.4s, v6.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v23.4s, v11.4s, v5.s[0]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v27.4s, v11.4s, v6.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "fmla v20.4s, v12.4s, v5.s[1]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v24.4s, v12.4s, v6.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "fmla v21.4s, v13.4s, v5.s[1]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v25.4s, v13.4s, v6.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "fmla v22.4s, v14.4s, v5.s[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v26.4s, v14.4s, v6.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "fmla v23.4s, v15.4s, v5.s[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v27.4s, v15.4s, v6.s[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "fmla v20.4s, v8.4s, v5.s[2]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v24.4s, v8.4s, v6.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "fmla v21.4s, v9.4s, v5.s[2]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v25.4s, v9.4s, v6.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v22.4s, v10.4s, v5.s[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v26.4s, v10.4s, v6.s[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v23.4s, v11.4s, v5.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "fmla v27.4s, v11.4s, v6.s[2]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "fmla v20.4s, v12.4s, v5.s[3]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "fmla v24.4s, v12.4s, v6.s[3]\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "fmla v21.4s, v13.4s, v5.s[3]\n"
+                        "fmla v25.4s, v13.4s, v6.s[3]\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "fmla v22.4s, v14.4s, v5.s[3]\n"
+                        "fmla v26.4s, v14.4s, v6.s[3]\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "fmla v23.4s, v15.4s, v5.s[3]\n"
+                        "fmla v27.4s, v15.4s, v6.s[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "fmla v24.4s, v12.4s, v2.s[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "fmla v25.4s, v13.4s, v2.s[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "fmla v26.4s, v14.4s, v2.s[1]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "fmla v27.4s, v15.4s, v2.s[1]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "fmla v24.4s, v8.4s, v2.s[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "fmla v25.4s, v9.4s, v2.s[2]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "fmla v26.4s, v10.4s, v2.s[2]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "fmla v27.4s, v11.4s, v2.s[2]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "fmla v24.4s, v12.4s, v2.s[3]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "fmla v25.4s, v13.4s, v2.s[3]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "fmla v26.4s, v14.4s, v2.s[3]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "fmla v27.4s, v15.4s, v2.s[3]\n"
+                        "5:\n"
+                        "cbz %[blocks], 6f\n"
+                        "7:\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr s0, [%[a_ptr0]]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr s1, [a_ptr1]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x4\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr s2, [a_ptr2]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "add a_ptr2, a_ptr2, #0x4\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "b.ne 7b\n"
+                        "6:\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        "str q20, [c_ptr1]\n"
+                        "str q21, [c_ptr1, #0x10]\n"
+                        "str q22, [c_ptr1, #0x20]\n"
+                        "str q23, [c_ptr1, #0x30]\n"
+                        "str q24, [c_ptr2]\n"
+                        "str q25, [c_ptr2, #0x10]\n"
+                        "str q26, [c_ptr2, #0x20]\n"
+                        "str q27, [c_ptr2, #0x30]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        ".unreq temploadreg0\n"
+                        ".unreq temploadreg1\n"
+                        ".unreq temploadreg2\n"
+                        ".unreq temploadreg3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
+                    );
+                    break;
+                default:
+                case 4:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "a_ptr3 .req X2\n"
+                        "c_ptr1 .req X3\n"
+                        "c_ptr2 .req X4\n"
+                        "c_ptr3 .req X5\n"
+                        "temploadreg0 .req X6\n"
+                        "temploadreg1 .req X7\n"
+                        "temploadreg2 .req X8\n"
+                        "temploadreg3 .req X9\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "add a_ptr3, a_ptr2, %[lda]\n"
+                        "add c_ptr3, c_ptr2, %[ldc]\n"
+                        "cbz %[beta0], 1f\n"
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q3, [a_ptr3]\n"
+                        "movi v20.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v21.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v22.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "movi v23.4s, #0\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "movi v24.4s, #0\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "movi v25.4s, #0\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "movi v26.4s, #0\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "movi v27.4s, #0\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "movi v28.4s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "movi v29.4s, #0\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "movi v30.4s, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "movi v31.4s, #0\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "fmul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q20, [c_ptr1]\n"
+                        "fmul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q21, [c_ptr1, #0x10]\n"
+                        "fmul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q22, [c_ptr1, #0x20]\n"
+                        "fmul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q23, [c_ptr1, #0x30]\n"
+                        "fmul v20.4s, v20.4s, v15.4s\n"
+                        "ldr q24, [c_ptr2]\n"
+                        "fmul v21.4s, v21.4s, v15.4s\n"
+                        "ldr q25, [c_ptr2, #0x10]\n"
+                        "fmul v22.4s, v22.4s, v15.4s\n"
+                        "ldr q26, [c_ptr2, #0x20]\n"
+                        "fmul v23.4s, v23.4s, v15.4s\n"
+                        "ldr q27, [c_ptr2, #0x30]\n"
+                        "fmul v24.4s, v24.4s, v15.4s\n"
+                        "ldr q28, [c_ptr3]\n"
+                        "fmul v25.4s, v25.4s, v15.4s\n"
+                        "ldr q29, [c_ptr3, #0x10]\n"
+                        "fmul v26.4s, v26.4s, v15.4s\n"
+                        "ldr q30, [c_ptr3, #0x20]\n"
+                        "fmul v27.4s, v27.4s, v15.4s\n"
+                        "ldr q31, [c_ptr3, #0x30]\n"
+                        "fmul v28.4s, v28.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "fmul v29.4s, v29.4s, v15.4s\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "fmul v30.4s, v30.4s, v15.4s\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "fmul v31.4s, v31.4s, v15.4s\n"
+                        "ldr q3, [a_ptr3]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        "fmla v28.4s, v8.4s, v3.s[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr d5, [a_ptr1]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "ldr d6, [a_ptr2]\n"
+                        "fmla v29.4s, v9.4s, v3.s[0]\n"
+                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr d7, [a_ptr3]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "ldr temploadreg3, [a_ptr3, #0x8]\n"
+                        "fmla v30.4s, v10.4s, v3.s[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v31.4s, v11.4s, v3.s[0]\n"
+                        "ins v5.d[1], temploadreg1\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "fmla v24.4s, v12.4s, v2.s[1]\n"
+                        "ins v6.d[1], temploadreg2\n"
+                        "fmla v28.4s, v12.4s, v3.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "ins v7.d[1], temploadreg3\n"
+                        "fmla v25.4s, v13.4s, v2.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "fmla v29.4s, v13.4s, v3.s[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "fmla v26.4s, v14.4s, v2.s[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "fmla v30.4s, v14.4s, v3.s[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "fmla v27.4s, v15.4s, v2.s[1]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v31.4s, v15.4s, v3.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v24.4s, v8.4s, v2.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "fmla v28.4s, v8.4s, v3.s[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v25.4s, v9.4s, v2.s[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v29.4s, v9.4s, v3.s[2]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v26.4s, v10.4s, v2.s[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v30.4s, v10.4s, v3.s[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        "fmla v27.4s, v11.4s, v2.s[2]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v31.4s, v11.4s, v3.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        "fmla v24.4s, v12.4s, v2.s[3]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v28.4s, v12.4s, v3.s[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v25.4s, v13.4s, v2.s[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "fmla v29.4s, v13.4s, v3.s[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "fmla v26.4s, v14.4s, v2.s[3]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v30.4s, v14.4s, v3.s[3]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v27.4s, v15.4s, v2.s[3]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v31.4s, v15.4s, v3.s[3]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v20.4s, v8.4s, v5.s[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla v24.4s, v8.4s, v6.s[0]\n"
+                        "ldr d0, [%[a_ptr0], #-0x10]\n"
+                        "fmla v28.4s, v8.4s, v7.s[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v21.4s, v9.4s, v5.s[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v25.4s, v9.4s, v6.s[0]\n"
+                        "ins v0.d[1], temploadreg0\n"
+                        "fmla v29.4s, v9.4s, v7.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v22.4s, v10.4s, v5.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla v26.4s, v10.4s, v6.s[0]\n"
+                        "ldr d1, [a_ptr1, #-0x10]\n"
+                        "fmla v30.4s, v10.4s, v7.s[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "fmla v23.4s, v11.4s, v5.s[0]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v27.4s, v11.4s, v6.s[0]\n"
+                        "ins v1.d[1], temploadreg1\n"
+                        "fmla v31.4s, v11.4s, v7.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "fmla v20.4s, v12.4s, v5.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "fmla v24.4s, v12.4s, v6.s[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v28.4s, v12.4s, v7.s[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "fmla v21.4s, v13.4s, v5.s[1]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "fmla v25.4s, v13.4s, v6.s[1]\n"
+                        "ldr d2, [a_ptr2, #-0x10]\n"
+                        "fmla v29.4s, v13.4s, v7.s[1]\n"
+                        "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "fmla v22.4s, v14.4s, v5.s[1]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v26.4s, v14.4s, v6.s[1]\n"
+                        "ins v2.d[1], temploadreg2\n"
+                        "fmla v30.4s, v14.4s, v7.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "fmla v23.4s, v15.4s, v5.s[1]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v27.4s, v15.4s, v6.s[1]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v31.4s, v15.4s, v7.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "fmla v20.4s, v8.4s, v5.s[2]\n"
+                        "add a_ptr3, a_ptr3, #0x20\n"
+                        "fmla v24.4s, v8.4s, v6.s[2]\n"
+                        "ldr d3, [a_ptr3, #-0x10]\n"
+                        "fmla v28.4s, v8.4s, v7.s[2]\n"
+                        "ldr temploadreg3, [a_ptr3, #-0x8]\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v21.4s, v9.4s, v5.s[2]\n"
+                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                        "fmla v25.4s, v9.4s, v6.s[2]\n"
+                        "ins v3.d[1], temploadreg3\n"
+                        "fmla v29.4s, v9.4s, v7.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                        "fmla v22.4s, v10.4s, v5.s[2]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v26.4s, v10.4s, v6.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "fmla v30.4s, v10.4s, v7.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v23.4s, v11.4s, v5.s[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        "fmla v27.4s, v11.4s, v6.s[2]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v31.4s, v11.4s, v7.s[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v20.4s, v12.4s, v5.s[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "fmla v24.4s, v12.4s, v6.s[3]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v28.4s, v12.4s, v7.s[3]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        "fmla v21.4s, v13.4s, v5.s[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v25.4s, v13.4s, v6.s[3]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v29.4s, v13.4s, v7.s[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "fmla v22.4s, v14.4s, v5.s[3]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v26.4s, v14.4s, v6.s[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "fmla v30.4s, v14.4s, v7.s[3]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v23.4s, v15.4s, v5.s[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "fmla v27.4s, v15.4s, v6.s[3]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v31.4s, v15.4s, v7.s[3]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "prfm PSTL1KEEP, [c_ptr1]\n"
+                        "prfm PSTL1KEEP, [c_ptr2]\n"
+                        "prfm PSTL1KEEP, [c_ptr3]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr d4, [%[a_ptr0]]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "ldr d5, [a_ptr1]\n"
+                        "fmla v28.4s, v8.4s, v3.s[0]\n"
+                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr d6, [a_ptr2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "ldr d7, [a_ptr3]\n"
+                        "fmla v29.4s, v9.4s, v3.s[0]\n"
+                        "ldr temploadreg3, [a_ptr3, #0x8]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "ins v4.d[1], temploadreg0\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v30.4s, v10.4s, v3.s[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ins v5.d[1], temploadreg1\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "fmla v31.4s, v11.4s, v3.s[0]\n"
+                        "ins v6.d[1], temploadreg2\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "fmla v24.4s, v12.4s, v2.s[1]\n"
+                        "ins v7.d[1], temploadreg3\n"
+                        "fmla v28.4s, v12.4s, v3.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v25.4s, v13.4s, v2.s[1]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "fmla v29.4s, v13.4s, v3.s[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "fmla v26.4s, v14.4s, v2.s[1]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v30.4s, v14.4s, v3.s[1]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v27.4s, v15.4s, v2.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "fmla v31.4s, v15.4s, v3.s[1]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v24.4s, v8.4s, v2.s[2]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v28.4s, v8.4s, v3.s[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "ldr d8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v25.4s, v9.4s, v2.s[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                        "fmla v29.4s, v9.4s, v3.s[2]\n"
+                        "ldr d9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                        "fmla v26.4s, v10.4s, v2.s[2]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                        "fmla v30.4s, v10.4s, v3.s[2]\n"
+                        "ldr d10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                        "fmla v27.4s, v11.4s, v2.s[2]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v31.4s, v11.4s, v3.s[2]\n"
+                        "ldr d11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v24.4s, v12.4s, v2.s[3]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                        "fmla v28.4s, v12.4s, v3.s[3]\n"
+                        "ldr d12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                        "fmla v25.4s, v13.4s, v2.s[3]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v29.4s, v13.4s, v3.s[3]\n"
+                        "ldr d13, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "fmla v26.4s, v14.4s, v2.s[3]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v30.4s, v14.4s, v3.s[3]\n"
+                        "ldr d14, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "fmla v27.4s, v15.4s, v2.s[3]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v31.4s, v15.4s, v3.s[3]\n"
+                        "ldr d15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "fmla v20.4s, v8.4s, v5.s[0]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "fmla v24.4s, v8.4s, v6.s[0]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v28.4s, v8.4s, v7.s[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "fmla v21.4s, v9.4s, v5.s[0]\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "fmla v25.4s, v9.4s, v6.s[0]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v29.4s, v9.4s, v7.s[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "fmla v22.4s, v10.4s, v5.s[0]\n"
+                        "fmla v26.4s, v10.4s, v6.s[0]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v30.4s, v10.4s, v7.s[0]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "fmla v23.4s, v11.4s, v5.s[0]\n"
+                        "fmla v27.4s, v11.4s, v6.s[0]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v31.4s, v11.4s, v7.s[0]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "fmla v20.4s, v12.4s, v5.s[1]\n"
+                        "fmla v24.4s, v12.4s, v6.s[1]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v28.4s, v12.4s, v7.s[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "fmla v21.4s, v13.4s, v5.s[1]\n"
+                        "fmla v25.4s, v13.4s, v6.s[1]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v29.4s, v13.4s, v7.s[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "fmla v22.4s, v14.4s, v5.s[1]\n"
+                        "fmla v26.4s, v14.4s, v6.s[1]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v30.4s, v14.4s, v7.s[1]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "fmla v23.4s, v15.4s, v5.s[1]\n"
+                        "fmla v27.4s, v15.4s, v6.s[1]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v31.4s, v15.4s, v7.s[1]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "fmla v20.4s, v8.4s, v5.s[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v24.4s, v8.4s, v6.s[2]\n"
+                        "fmla v28.4s, v8.4s, v7.s[2]\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "fmla v21.4s, v9.4s, v5.s[2]\n"
+                        "fmla v25.4s, v9.4s, v6.s[2]\n"
+                        "fmla v29.4s, v9.4s, v7.s[2]\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "fmla v22.4s, v10.4s, v5.s[2]\n"
+                        "fmla v26.4s, v10.4s, v6.s[2]\n"
+                        "fmla v30.4s, v10.4s, v7.s[2]\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "fmla v23.4s, v11.4s, v5.s[2]\n"
+                        "fmla v27.4s, v11.4s, v6.s[2]\n"
+                        "fmla v31.4s, v11.4s, v7.s[2]\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "fmla v20.4s, v12.4s, v5.s[3]\n"
+                        "fmla v24.4s, v12.4s, v6.s[3]\n"
+                        "fmla v28.4s, v12.4s, v7.s[3]\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "fmla v21.4s, v13.4s, v5.s[3]\n"
+                        "fmla v25.4s, v13.4s, v6.s[3]\n"
+                        "fmla v29.4s, v13.4s, v7.s[3]\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "fmla v22.4s, v14.4s, v5.s[3]\n"
+                        "fmla v26.4s, v14.4s, v6.s[3]\n"
+                        "fmla v30.4s, v14.4s, v7.s[3]\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "fmla v23.4s, v15.4s, v5.s[3]\n"
+                        "fmla v27.4s, v15.4s, v6.s[3]\n"
+                        "fmla v31.4s, v15.4s, v7.s[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                        "fmla v28.4s, v8.4s, v3.s[0]\n"
+                        "ldr d8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "ins v8.d[1], temploadreg0\n"
+                        "fmla v29.4s, v9.4s, v3.s[0]\n"
+                        "ldr d9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "ins v9.d[1], temploadreg1\n"
+                        "fmla v30.4s, v10.4s, v3.s[0]\n"
+                        "ldr d10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "ins v10.d[1], temploadreg2\n"
+                        "fmla v31.4s, v11.4s, v3.s[0]\n"
+                        "ldr d11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "fmla v24.4s, v12.4s, v2.s[1]\n"
+                        "ins v11.d[1], temploadreg3\n"
+                        "fmla v28.4s, v12.4s, v3.s[1]\n"
+                        "ldr d12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "fmla v25.4s, v13.4s, v2.s[1]\n"
+                        "ins v12.d[1], temploadreg0\n"
+                        "fmla v29.4s, v13.4s, v3.s[1]\n"
+                        "ldr d13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "fmla v26.4s, v14.4s, v2.s[1]\n"
+                        "ins v13.d[1], temploadreg1\n"
+                        "fmla v30.4s, v14.4s, v3.s[1]\n"
+                        "ldr d14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "fmla v27.4s, v15.4s, v2.s[1]\n"
+                        "ins v14.d[1], temploadreg2\n"
+                        "fmla v31.4s, v15.4s, v3.s[1]\n"
+                        "ldr d15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "ins v15.d[1], temploadreg3\n"
+                        "fmla v24.4s, v8.4s, v2.s[2]\n"
+                        "fmla v28.4s, v8.4s, v3.s[2]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "fmla v25.4s, v9.4s, v2.s[2]\n"
+                        "fmla v29.4s, v9.4s, v3.s[2]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "fmla v26.4s, v10.4s, v2.s[2]\n"
+                        "fmla v30.4s, v10.4s, v3.s[2]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "fmla v27.4s, v11.4s, v2.s[2]\n"
+                        "fmla v31.4s, v11.4s, v3.s[2]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "fmla v24.4s, v12.4s, v2.s[3]\n"
+                        "fmla v28.4s, v12.4s, v3.s[3]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "fmla v25.4s, v13.4s, v2.s[3]\n"
+                        "fmla v29.4s, v13.4s, v3.s[3]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "fmla v26.4s, v14.4s, v2.s[3]\n"
+                        "fmla v30.4s, v14.4s, v3.s[3]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "fmla v27.4s, v15.4s, v2.s[3]\n"
+                        "fmla v31.4s, v15.4s, v3.s[3]\n"
+                        "5:\n"
+                        "cbz %[blocks], 6f\n"
+                        "7:\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr s0, [%[a_ptr0]]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr s1, [a_ptr1]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x4\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr s2, [a_ptr2]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "add a_ptr2, a_ptr2, #0x4\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr s3, [a_ptr3]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "add a_ptr3, a_ptr3, #0x4\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "fmla v28.4s, v8.4s, v3.s[0]\n"
+                        "fmla v29.4s, v9.4s, v3.s[0]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "fmla v30.4s, v10.4s, v3.s[0]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "fmla v31.4s, v11.4s, v3.s[0]\n"
+                        "b.ne 7b\n"
+                        "6:\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        "str q20, [c_ptr1]\n"
+                        "str q21, [c_ptr1, #0x10]\n"
+                        "str q22, [c_ptr1, #0x20]\n"
+                        "str q23, [c_ptr1, #0x30]\n"
+                        "str q24, [c_ptr2]\n"
+                        "str q25, [c_ptr2, #0x10]\n"
+                        "str q26, [c_ptr2, #0x20]\n"
+                        "str q27, [c_ptr2, #0x30]\n"
+                        "str q28, [c_ptr3]\n"
+                        "str q29, [c_ptr3, #0x10]\n"
+                        "str q30, [c_ptr3, #0x20]\n"
+                        "str q31, [c_ptr3, #0x30]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq a_ptr3\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        ".unreq c_ptr3\n"
+                        ".unreq temploadreg0\n"
+                        ".unreq temploadreg1\n"
+                        ".unreq temploadreg2\n"
+                        ".unreq temploadreg3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
+                    );
+                    break;
+            }
+            if (use_result_buffer) {
+                for(int cy=0; cy<std::min(M-y, 4); cy++) {
+                    for(unsigned int cx=0; cx<width; cx++) {
+                        c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
+                    }
+                }
+            }
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp
new file mode 100644
index 0000000..504769b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp
@@ -0,0 +1,1726 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <algorithm>
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void a64_hybrid_fp32_mla_16x4(const float *A, int lda, const float *B, float *C, int ldc, float beta, int M, int N, int K) {
+    const long beta0 = (beta == 0.0f);
+    const int K_stride = K;
+    const long loops_count = ((K + 4) / 8) - 1;
+    K -= loops_count * 8;
+    const long regs_count = (K / 4) - 1;
+    K -= (regs_count + 1) * 4;
+    const long blocks_count = K / 1;
+
+    for (int y=0; y<M; y+=4) {
+        const float * const a_ptr0_base = A + (y * lda);
+        const unsigned long ldab = lda * sizeof(float);
+
+        float *c_ptr0 = C + (y * ldc);
+
+        for (int x0=0; x0<N; x0+=16ul) {
+            const long width = std::min((unsigned long)N-x0, 16ul);
+            const float *betaptr = &beta;
+            long loops = loops_count;
+            long regs = regs_count;
+            long blocks = blocks_count;
+            const float *a_ptr0 = a_ptr0_base;
+            const float *b_ptr0 = B + (K_stride * x0);
+            const bool use_result_buffer = (width < 16);
+            float result_buffer[64];
+            const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(float);
+            float *c_ptr_real = c_ptr0;
+            if (use_result_buffer && !beta0) {
+                for(int cy=0; cy<std::min(M-y, 4); cy++) {
+                    for(unsigned int cx=0; cx<width; cx++) {
+                        result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
+                    }
+                }
+            }
+            if (use_result_buffer) {
+                c_ptr0 = result_buffer;
+            }
+
+            switch(M-y) {
+                case 1:
+                    __asm __volatile (
+                        "cbz %[beta0], 1f\n"
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "fmul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "fmul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "ldr q0, [%[a_ptr0], #-0x10]\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "5:\n"
+                        "cbz %[blocks], 6f\n"
+                        "7:\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr s0, [%[a_ptr0]]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "b.ne 7b\n"
+                        "6:\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+                    );
+                    break;
+                case 2: // Two-row variant: processes rows a_ptr0/a_ptr1 of A into rows c_ptr0/c_ptr1 of C, 16 fp32 columns each.
+                    __asm __volatile ( // Register use: v0/v1 + v4/v5 = A values, v8-v15 = B values, v16-v19 = row 0 sums, v20-v23 = row 1 sums.
+                        "a_ptr1 .req X0\n" // assembler alias: second A row pointer lives in x0
+                        "c_ptr1 .req X1\n" // assembler alias: second C row pointer lives in x1
+                        "add a_ptr1, %[a_ptr0], %[lda]\n" // a_ptr1 = a_ptr0 + lda (operand bound to ldab; presumably a byte stride - confirm at its definition)
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n" // c_ptr1 = c_ptr0 + ldc (operand bound to ldcb; same assumption)
+                        "cbz %[beta0], 1f\n" // beta0 == 0: branch to 1: (load C and scale by *betaptr); non-zero: fall through and start from zero
+                        "movi v16.4s, #0\n" // zero the row 0 accumulators (v16-v19), interleaved with the first loads
+                        "ldr q0, [%[a_ptr0]]\n" // v0 = first 4 A values of row 0
+                        "movi v17.4s, #0\n"
+                        "ldr q1, [a_ptr1]\n" // v1 = first 4 A values of row 1
+                        "movi v18.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n" // v8-v11 = the 16 B values (0x40 bytes) for K-step 0
+                        "movi v19.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v20.4s, #0\n" // zero the row 1 accumulators (v20-v23)
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "movi v21.4s, #0\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "movi v22.4s, #0\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n" // v12-v14 = first three B vectors for K-step 1 (v15 is fetched later)
+                        "movi v23.4s, #0\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n" // step past the 4 A values just loaded
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n" // step past two K-steps of B; the pending v15 now sits at b_ptr0 - 0x10
+                        "cbz %[loops], 2f\n" // no full main-loop iterations requested: go straight to the tail
+                        "b 3f\n" // otherwise enter the main loop, skipping the beta path below
+                        "1:\n" // beta path: accumulators start as C * beta instead of zero
+                        "ld1r {v15.4s}, [%[betaptr]]\n" // broadcast the float at betaptr into all 4 lanes of v15
+                        "ldr q16, [%[c_ptr0]]\n" // load the existing 16 values of C row 0
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "fmul v16.4s, v16.4s, v15.4s\n" // scale by beta, interleaved with the loads of C row 1
+                        "ldr q20, [c_ptr1]\n"
+                        "fmul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q21, [c_ptr1, #0x10]\n"
+                        "fmul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q22, [c_ptr1, #0x20]\n"
+                        "fmul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q23, [c_ptr1, #0x30]\n"
+                        "fmul v20.4s, v20.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n" // same A/B preload as the zero-init path above
+                        "fmul v21.4s, v21.4s, v15.4s\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "fmul v22.4s, v22.4s, v15.4s\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmul v23.4s, v23.4s, v15.4s\n" // last use of beta in v15; v15 is reused for B data from here on
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n" // no full iterations: go to the tail, else fall into the main loop
+                        "3:\n" // main loop: 8 K-steps per iteration (lanes 0-3 of v0/v1, then lanes 0-3 of v4/v5)
+                        "fmla v16.4s, v8.4s, v0.s[0]\n" // row 0 += B vector * A(row 0, k) broadcast from one lane
+                        "ldr q15, [%[b_ptr0], #-0x10]\n" // fetch the last B vector of the pending K-step 1
+                        "fmla v20.4s, v8.4s, v1.s[0]\n" // row 1 uses the same B vector with its own A lane
+                        "ldr q4, [%[a_ptr0]]\n" // v4 = next 4 A values of row 0 (used in the second half)
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr q5, [a_ptr1]\n" // v5 = next 4 A values of row 1
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n" // each B register is reloaded as soon as both rows have consumed it
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "subs %[loops], %[loops], #0x1\n" // sets the flags tested by b.ne at the loop end; no instruction in between writes flags
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n" // prefetch upcoming A data for row 0
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n" // advance A by 8 floats (v4 now, v0 reload below at -0x10)
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n" // prefetch upcoming A data for row 1
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n" // first of two 0x100 advances per iteration; following loads use negative offsets
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n" // last use of v0 in this iteration
+                        "ldr q0, [%[a_ptr0], #-0x10]\n" // reload v0 with the A values for the next iteration
+                        "fmla v23.4s, v15.4s, v1.s[3]\n" // last use of v1 in this iteration
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n" // second half: K-steps 4-7 take their A values from v4/v5
+                        "ldr q1, [a_ptr1, #-0x10]\n" // reload v1 with the A values for the next iteration
+                        "fmla v20.4s, v8.4s, v5.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "fmla v21.4s, v9.4s, v5.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "fmla v22.4s, v10.4s, v5.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "fmla v23.4s, v11.4s, v5.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "fmla v20.4s, v12.4s, v5.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "fmla v21.4s, v13.4s, v5.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "fmla v22.4s, v14.4s, v5.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "fmla v23.4s, v15.4s, v5.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n" // second 0x100 advance: 0x200 bytes of B (8 K-steps) per iteration in total
+                        "fmla v20.4s, v8.4s, v5.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n" // from here the loads preload v8-v14 for the next iteration (or the tail)
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "fmla v21.4s, v9.4s, v5.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "fmla v22.4s, v10.4s, v5.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "fmla v23.4s, v11.4s, v5.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "fmla v20.4s, v12.4s, v5.s[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "fmla v21.4s, v13.4s, v5.s[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "fmla v22.4s, v14.4s, v5.s[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n" // v15 is still in use here, so its reload happens at 3: or 2:
+                        "fmla v23.4s, v15.4s, v5.s[3]\n"
+                        "b.ne 3b\n" // loop while the subs above left loops non-zero
+                        "2:\n" // tail: v0/v1 and v8-v14 are already preloaded at this point
+                        "ldr q15, [%[b_ptr0], #-0x10]\n" // complete the B data for the pending K-step 1
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n" // prefetch-for-store of both C rows ahead of the write-back
+                        "prfm PSTL1KEEP, [c_ptr1]\n"
+                        "cbz %[regs], 4f\n" // regs == 0: only v0/v1 remain (4 K-steps at 4:); otherwise do 8 K-steps (v0/v1 then v4/v5)
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n" // load one more vector of A per row for the second group of 4 K-steps
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n" // step past the v4 data so the remainder loop at 7: continues after it
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n" // likewise for row 1 (v5 data)
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n" // second group of the long tail: A values come from v4/v5
+                        "fmla v20.4s, v8.4s, v5.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "fmla v21.4s, v9.4s, v5.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "fmla v22.4s, v10.4s, v5.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "fmla v23.4s, v11.4s, v5.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "fmla v20.4s, v12.4s, v5.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "fmla v21.4s, v13.4s, v5.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "fmla v22.4s, v14.4s, v5.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "fmla v23.4s, v15.4s, v5.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n" // last B load of the long tail; nothing is preloaded beyond this
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n" // leave b_ptr0 just past the data consumed, ready for the remainder loop
+                        "fmla v20.4s, v8.4s, v5.s[2]\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "fmla v21.4s, v9.4s, v5.s[2]\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "fmla v22.4s, v10.4s, v5.s[2]\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "fmla v23.4s, v11.4s, v5.s[2]\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "fmla v20.4s, v12.4s, v5.s[3]\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "fmla v21.4s, v13.4s, v5.s[3]\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "fmla v22.4s, v14.4s, v5.s[3]\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "fmla v23.4s, v15.4s, v5.s[3]\n"
+                        "b 5f\n" // skip the short tail
+                        "4:\n" // short tail: the final 4 K-steps using only the preloaded v0/v1 (A pointers are not advanced here)
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n" // leave b_ptr0 just past the data consumed, ready for the remainder loop
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "5:\n" // both tails rejoin here
+                        "cbz %[blocks], 6f\n" // no leftover single K-steps: go straight to the write-back
+                        "7:\n" // remainder loop: one K-step per iteration (one A float per row, 0x40 bytes of B)
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n" // sets the flags tested by b.ne at the end of this loop
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr s0, [%[a_ptr0]]\n" // scalar load: a single float of A row 0 into lane 0 of v0
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr s1, [a_ptr1]\n" // scalar load: a single float of A row 1 into lane 0 of v1
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x4\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "b.ne 7b\n" // repeat while blocks is non-zero
+                        "6:\n" // write-back: store 16 floats to each C row
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n" // c_ptr0 is a read-write operand, so it leaves the asm advanced by the 0x40 bytes written
+                        "str q20, [c_ptr1]\n" // c_ptr1 is only a scratch alias (x1) and is not advanced
+                        "str q21, [c_ptr1, #0x10]\n"
+                        "str q22, [c_ptr1, #0x20]\n"
+                        "str q23, [c_ptr1, #0x30]\n"
+                        ".unreq a_ptr1\n" // drop the aliases; the next asm block declares the same names again
+                        ".unreq c_ptr1\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks) // read-write operands: a_ptr0, b_ptr0, c_ptr0, loops and blocks are written by the asm; regs is only tested
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb) // read-only inputs; %[width] is not referenced anywhere in this asm body
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory" // x0/x1 back the .req aliases; "cc" covers subs; "memory" covers the loads/stores through pointers
+                    );
+                    break;
+                case 3:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "c_ptr1 .req X2\n"
+                        "c_ptr2 .req X3\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "cbz %[beta0], 1f\n"
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v20.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v21.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "movi v22.4s, #0\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "movi v23.4s, #0\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "movi v24.4s, #0\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "movi v25.4s, #0\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "movi v26.4s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "movi v27.4s, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "fmul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q20, [c_ptr1]\n"
+                        "fmul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q21, [c_ptr1, #0x10]\n"
+                        "fmul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q22, [c_ptr1, #0x20]\n"
+                        "fmul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q23, [c_ptr1, #0x30]\n"
+                        "fmul v20.4s, v20.4s, v15.4s\n"
+                        "ldr q24, [c_ptr2]\n"
+                        "fmul v21.4s, v21.4s, v15.4s\n"
+                        "ldr q25, [c_ptr2, #0x10]\n"
+                        "fmul v22.4s, v22.4s, v15.4s\n"
+                        "ldr q26, [c_ptr2, #0x20]\n"
+                        "fmul v23.4s, v23.4s, v15.4s\n"
+                        "ldr q27, [c_ptr2, #0x30]\n"
+                        "fmul v24.4s, v24.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "fmul v25.4s, v25.4s, v15.4s\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "fmul v26.4s, v26.4s, v15.4s\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "fmul v27.4s, v27.4s, v15.4s\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr q6, [a_ptr2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                        "fmla v24.4s, v12.4s, v2.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "fmla v25.4s, v13.4s, v2.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "fmla v26.4s, v14.4s, v2.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "fmla v27.4s, v15.4s, v2.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "fmla v24.4s, v8.4s, v2.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "fmla v25.4s, v9.4s, v2.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "fmla v26.4s, v10.4s, v2.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "fmla v27.4s, v11.4s, v2.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "fmla v24.4s, v12.4s, v2.s[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "fmla v25.4s, v13.4s, v2.s[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "fmla v26.4s, v14.4s, v2.s[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "ldr q0, [%[a_ptr0], #-0x10]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "ldr q1, [a_ptr1, #-0x10]\n"
+                        "fmla v27.4s, v15.4s, v2.s[3]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "ldr q2, [a_ptr2, #-0x10]\n"
+                        "fmla v20.4s, v8.4s, v5.s[0]\n"
+                        "fmla v24.4s, v8.4s, v6.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "fmla v21.4s, v9.4s, v5.s[0]\n"
+                        "fmla v25.4s, v9.4s, v6.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "fmla v22.4s, v10.4s, v5.s[0]\n"
+                        "fmla v26.4s, v10.4s, v6.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "fmla v23.4s, v11.4s, v5.s[0]\n"
+                        "fmla v27.4s, v11.4s, v6.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "fmla v20.4s, v12.4s, v5.s[1]\n"
+                        "fmla v24.4s, v12.4s, v6.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "fmla v21.4s, v13.4s, v5.s[1]\n"
+                        "fmla v25.4s, v13.4s, v6.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "fmla v22.4s, v14.4s, v5.s[1]\n"
+                        "fmla v26.4s, v14.4s, v6.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "fmla v23.4s, v15.4s, v5.s[1]\n"
+                        "fmla v27.4s, v15.4s, v6.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v5.s[2]\n"
+                        "fmla v24.4s, v8.4s, v6.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "fmla v21.4s, v9.4s, v5.s[2]\n"
+                        "fmla v25.4s, v9.4s, v6.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "fmla v22.4s, v10.4s, v5.s[2]\n"
+                        "fmla v26.4s, v10.4s, v6.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "fmla v23.4s, v11.4s, v5.s[2]\n"
+                        "fmla v27.4s, v11.4s, v6.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "fmla v20.4s, v12.4s, v5.s[3]\n"
+                        "fmla v24.4s, v12.4s, v6.s[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "fmla v21.4s, v13.4s, v5.s[3]\n"
+                        "fmla v25.4s, v13.4s, v6.s[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "fmla v22.4s, v14.4s, v5.s[3]\n"
+                        "fmla v26.4s, v14.4s, v6.s[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "fmla v23.4s, v15.4s, v5.s[3]\n"
+                        "fmla v27.4s, v15.4s, v6.s[3]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "prfm PSTL1KEEP, [c_ptr1]\n"
+                        "prfm PSTL1KEEP, [c_ptr2]\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "ldr q6, [a_ptr2]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "fmla v24.4s, v12.4s, v2.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "fmla v25.4s, v13.4s, v2.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "fmla v26.4s, v14.4s, v2.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "fmla v27.4s, v15.4s, v2.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "fmla v24.4s, v8.4s, v2.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "fmla v25.4s, v9.4s, v2.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "fmla v26.4s, v10.4s, v2.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "fmla v27.4s, v11.4s, v2.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "fmla v24.4s, v12.4s, v2.s[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "fmla v25.4s, v13.4s, v2.s[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "fmla v26.4s, v14.4s, v2.s[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "fmla v27.4s, v15.4s, v2.s[3]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "fmla v20.4s, v8.4s, v5.s[0]\n"
+                        "fmla v24.4s, v8.4s, v6.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "fmla v21.4s, v9.4s, v5.s[0]\n"
+                        "fmla v25.4s, v9.4s, v6.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "fmla v22.4s, v10.4s, v5.s[0]\n"
+                        "fmla v26.4s, v10.4s, v6.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "fmla v23.4s, v11.4s, v5.s[0]\n"
+                        "fmla v27.4s, v11.4s, v6.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "fmla v20.4s, v12.4s, v5.s[1]\n"
+                        "fmla v24.4s, v12.4s, v6.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "fmla v21.4s, v13.4s, v5.s[1]\n"
+                        "fmla v25.4s, v13.4s, v6.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "fmla v22.4s, v14.4s, v5.s[1]\n"
+                        "fmla v26.4s, v14.4s, v6.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "fmla v23.4s, v15.4s, v5.s[1]\n"
+                        "fmla v27.4s, v15.4s, v6.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "fmla v20.4s, v8.4s, v5.s[2]\n"
+                        "fmla v24.4s, v8.4s, v6.s[2]\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "fmla v21.4s, v9.4s, v5.s[2]\n"
+                        "fmla v25.4s, v9.4s, v6.s[2]\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "fmla v22.4s, v10.4s, v5.s[2]\n"
+                        "fmla v26.4s, v10.4s, v6.s[2]\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "fmla v23.4s, v11.4s, v5.s[2]\n"
+                        "fmla v27.4s, v11.4s, v6.s[2]\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "fmla v20.4s, v12.4s, v5.s[3]\n"
+                        "fmla v24.4s, v12.4s, v6.s[3]\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "fmla v21.4s, v13.4s, v5.s[3]\n"
+                        "fmla v25.4s, v13.4s, v6.s[3]\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "fmla v22.4s, v14.4s, v5.s[3]\n"
+                        "fmla v26.4s, v14.4s, v6.s[3]\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "fmla v23.4s, v15.4s, v5.s[3]\n"
+                        "fmla v27.4s, v15.4s, v6.s[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "fmla v24.4s, v12.4s, v2.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "fmla v25.4s, v13.4s, v2.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "fmla v26.4s, v14.4s, v2.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "fmla v27.4s, v15.4s, v2.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "fmla v24.4s, v8.4s, v2.s[2]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "fmla v25.4s, v9.4s, v2.s[2]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "fmla v26.4s, v10.4s, v2.s[2]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "fmla v27.4s, v11.4s, v2.s[2]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "fmla v24.4s, v12.4s, v2.s[3]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "fmla v25.4s, v13.4s, v2.s[3]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "fmla v26.4s, v14.4s, v2.s[3]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "fmla v27.4s, v15.4s, v2.s[3]\n"
+                        "5:\n"
+                        "cbz %[blocks], 6f\n"
+                        "7:\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr s0, [%[a_ptr0]]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr s1, [a_ptr1]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x4\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr s2, [a_ptr2]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "add a_ptr2, a_ptr2, #0x4\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "b.ne 7b\n"
+                        "6:\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        "str q20, [c_ptr1]\n"
+                        "str q21, [c_ptr1, #0x10]\n"
+                        "str q22, [c_ptr1, #0x20]\n"
+                        "str q23, [c_ptr1, #0x30]\n"
+                        "str q24, [c_ptr2]\n"
+                        "str q25, [c_ptr2, #0x10]\n"
+                        "str q26, [c_ptr2, #0x20]\n"
+                        "str q27, [c_ptr2, #0x30]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+                    );
+                    break;
+                default:
+                case 4:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "a_ptr3 .req X2\n"
+                        "c_ptr1 .req X3\n"
+                        "c_ptr2 .req X4\n"
+                        "c_ptr3 .req X5\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "add a_ptr3, a_ptr2, %[lda]\n"
+                        "add c_ptr3, c_ptr2, %[ldc]\n"
+                        "cbz %[beta0], 1f\n"
+                        "movi v16.4s, #0\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "movi v17.4s, #0\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "movi v18.4s, #0\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "movi v19.4s, #0\n"
+                        "ldr q3, [a_ptr3]\n"
+                        "movi v20.4s, #0\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "movi v21.4s, #0\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "movi v22.4s, #0\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "movi v23.4s, #0\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "movi v24.4s, #0\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "movi v25.4s, #0\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "movi v26.4s, #0\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "movi v27.4s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "movi v28.4s, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "movi v29.4s, #0\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "movi v30.4s, #0\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "movi v31.4s, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1r {v15.4s}, [%[betaptr]]\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "fmul v16.4s, v16.4s, v15.4s\n"
+                        "ldr q20, [c_ptr1]\n"
+                        "fmul v17.4s, v17.4s, v15.4s\n"
+                        "ldr q21, [c_ptr1, #0x10]\n"
+                        "fmul v18.4s, v18.4s, v15.4s\n"
+                        "ldr q22, [c_ptr1, #0x20]\n"
+                        "fmul v19.4s, v19.4s, v15.4s\n"
+                        "ldr q23, [c_ptr1, #0x30]\n"
+                        "fmul v20.4s, v20.4s, v15.4s\n"
+                        "ldr q24, [c_ptr2]\n"
+                        "fmul v21.4s, v21.4s, v15.4s\n"
+                        "ldr q25, [c_ptr2, #0x10]\n"
+                        "fmul v22.4s, v22.4s, v15.4s\n"
+                        "ldr q26, [c_ptr2, #0x20]\n"
+                        "fmul v23.4s, v23.4s, v15.4s\n"
+                        "ldr q27, [c_ptr2, #0x30]\n"
+                        "fmul v24.4s, v24.4s, v15.4s\n"
+                        "ldr q28, [c_ptr3]\n"
+                        "fmul v25.4s, v25.4s, v15.4s\n"
+                        "ldr q29, [c_ptr3, #0x10]\n"
+                        "fmul v26.4s, v26.4s, v15.4s\n"
+                        "ldr q30, [c_ptr3, #0x20]\n"
+                        "fmul v27.4s, v27.4s, v15.4s\n"
+                        "ldr q31, [c_ptr3, #0x30]\n"
+                        "fmul v28.4s, v28.4s, v15.4s\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "fmul v29.4s, v29.4s, v15.4s\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "fmul v30.4s, v30.4s, v15.4s\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "fmul v31.4s, v31.4s, v15.4s\n"
+                        "ldr q3, [a_ptr3]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        "fmla v28.4s, v8.4s, v3.s[0]\n"
+                        "ldr q6, [a_ptr2]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr q7, [a_ptr3]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla v29.4s, v9.4s, v3.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla v30.4s, v10.4s, v3.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "add a_ptr3, a_ptr3, #0x20\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                        "fmla v31.4s, v11.4s, v3.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+                        "fmla v24.4s, v12.4s, v2.s[1]\n"
+                        "fmla v28.4s, v12.4s, v3.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "fmla v25.4s, v13.4s, v2.s[1]\n"
+                        "fmla v29.4s, v13.4s, v3.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "fmla v26.4s, v14.4s, v2.s[1]\n"
+                        "fmla v30.4s, v14.4s, v3.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "fmla v27.4s, v15.4s, v2.s[1]\n"
+                        "fmla v31.4s, v15.4s, v3.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "fmla v24.4s, v8.4s, v2.s[2]\n"
+                        "fmla v28.4s, v8.4s, v3.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "fmla v25.4s, v9.4s, v2.s[2]\n"
+                        "fmla v29.4s, v9.4s, v3.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "fmla v26.4s, v10.4s, v2.s[2]\n"
+                        "fmla v30.4s, v10.4s, v3.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "fmla v27.4s, v11.4s, v2.s[2]\n"
+                        "fmla v31.4s, v11.4s, v3.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "fmla v24.4s, v12.4s, v2.s[3]\n"
+                        "fmla v28.4s, v12.4s, v3.s[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "fmla v25.4s, v13.4s, v2.s[3]\n"
+                        "fmla v29.4s, v13.4s, v3.s[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "fmla v26.4s, v14.4s, v2.s[3]\n"
+                        "fmla v30.4s, v14.4s, v3.s[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "ldr q0, [%[a_ptr0], #-0x10]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "ldr q1, [a_ptr1, #-0x10]\n"
+                        "fmla v27.4s, v15.4s, v2.s[3]\n"
+                        "ldr q2, [a_ptr2, #-0x10]\n"
+                        "fmla v31.4s, v15.4s, v3.s[3]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "ldr q3, [a_ptr3, #-0x10]\n"
+                        "fmla v20.4s, v8.4s, v5.s[0]\n"
+                        "fmla v24.4s, v8.4s, v6.s[0]\n"
+                        "fmla v28.4s, v8.4s, v7.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "fmla v21.4s, v9.4s, v5.s[0]\n"
+                        "fmla v25.4s, v9.4s, v6.s[0]\n"
+                        "fmla v29.4s, v9.4s, v7.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "fmla v22.4s, v10.4s, v5.s[0]\n"
+                        "fmla v26.4s, v10.4s, v6.s[0]\n"
+                        "fmla v30.4s, v10.4s, v7.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "fmla v23.4s, v11.4s, v5.s[0]\n"
+                        "fmla v27.4s, v11.4s, v6.s[0]\n"
+                        "fmla v31.4s, v11.4s, v7.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "fmla v20.4s, v12.4s, v5.s[1]\n"
+                        "fmla v24.4s, v12.4s, v6.s[1]\n"
+                        "fmla v28.4s, v12.4s, v7.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "fmla v21.4s, v13.4s, v5.s[1]\n"
+                        "fmla v25.4s, v13.4s, v6.s[1]\n"
+                        "fmla v29.4s, v13.4s, v7.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "fmla v22.4s, v14.4s, v5.s[1]\n"
+                        "fmla v26.4s, v14.4s, v6.s[1]\n"
+                        "fmla v30.4s, v14.4s, v7.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "fmla v23.4s, v15.4s, v5.s[1]\n"
+                        "fmla v27.4s, v15.4s, v6.s[1]\n"
+                        "fmla v31.4s, v15.4s, v7.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v5.s[2]\n"
+                        "fmla v24.4s, v8.4s, v6.s[2]\n"
+                        "fmla v28.4s, v8.4s, v7.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "fmla v21.4s, v9.4s, v5.s[2]\n"
+                        "fmla v25.4s, v9.4s, v6.s[2]\n"
+                        "fmla v29.4s, v9.4s, v7.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "fmla v22.4s, v10.4s, v5.s[2]\n"
+                        "fmla v26.4s, v10.4s, v6.s[2]\n"
+                        "fmla v30.4s, v10.4s, v7.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "fmla v23.4s, v11.4s, v5.s[2]\n"
+                        "fmla v27.4s, v11.4s, v6.s[2]\n"
+                        "fmla v31.4s, v11.4s, v7.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "fmla v20.4s, v12.4s, v5.s[3]\n"
+                        "fmla v24.4s, v12.4s, v6.s[3]\n"
+                        "fmla v28.4s, v12.4s, v7.s[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "fmla v21.4s, v13.4s, v5.s[3]\n"
+                        "fmla v25.4s, v13.4s, v6.s[3]\n"
+                        "fmla v29.4s, v13.4s, v7.s[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "fmla v22.4s, v14.4s, v5.s[3]\n"
+                        "fmla v26.4s, v14.4s, v6.s[3]\n"
+                        "fmla v30.4s, v14.4s, v7.s[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "fmla v23.4s, v15.4s, v5.s[3]\n"
+                        "fmla v27.4s, v15.4s, v6.s[3]\n"
+                        "fmla v31.4s, v15.4s, v7.s[3]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "prfm PSTL1KEEP, [c_ptr1]\n"
+                        "prfm PSTL1KEEP, [c_ptr2]\n"
+                        "prfm PSTL1KEEP, [c_ptr3]\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "ldr q6, [a_ptr2]\n"
+                        "fmla v28.4s, v8.4s, v3.s[0]\n"
+                        "ldr q7, [a_ptr3]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "fmla v29.4s, v9.4s, v3.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "fmla v30.4s, v10.4s, v3.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "fmla v31.4s, v11.4s, v3.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "fmla v24.4s, v12.4s, v2.s[1]\n"
+                        "fmla v28.4s, v12.4s, v3.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "fmla v25.4s, v13.4s, v2.s[1]\n"
+                        "fmla v29.4s, v13.4s, v3.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "fmla v26.4s, v14.4s, v2.s[1]\n"
+                        "fmla v30.4s, v14.4s, v3.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "fmla v27.4s, v15.4s, v2.s[1]\n"
+                        "fmla v31.4s, v15.4s, v3.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "fmla v24.4s, v8.4s, v2.s[2]\n"
+                        "fmla v28.4s, v8.4s, v3.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "fmla v25.4s, v9.4s, v2.s[2]\n"
+                        "fmla v29.4s, v9.4s, v3.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "fmla v26.4s, v10.4s, v2.s[2]\n"
+                        "fmla v30.4s, v10.4s, v3.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "fmla v27.4s, v11.4s, v2.s[2]\n"
+                        "fmla v31.4s, v11.4s, v3.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "fmla v24.4s, v12.4s, v2.s[3]\n"
+                        "fmla v28.4s, v12.4s, v3.s[3]\n"
+                        "ldr q12, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "fmla v25.4s, v13.4s, v2.s[3]\n"
+                        "fmla v29.4s, v13.4s, v3.s[3]\n"
+                        "ldr q13, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "fmla v26.4s, v14.4s, v2.s[3]\n"
+                        "fmla v30.4s, v14.4s, v3.s[3]\n"
+                        "ldr q14, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "fmla v27.4s, v15.4s, v2.s[3]\n"
+                        "fmla v31.4s, v15.4s, v3.s[3]\n"
+                        "ldr q15, [%[b_ptr0], #-0x10]\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "fmla v20.4s, v8.4s, v5.s[0]\n"
+                        "fmla v24.4s, v8.4s, v6.s[0]\n"
+                        "fmla v28.4s, v8.4s, v7.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "fmla v21.4s, v9.4s, v5.s[0]\n"
+                        "fmla v25.4s, v9.4s, v6.s[0]\n"
+                        "fmla v29.4s, v9.4s, v7.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "fmla v22.4s, v10.4s, v5.s[0]\n"
+                        "fmla v26.4s, v10.4s, v6.s[0]\n"
+                        "fmla v30.4s, v10.4s, v7.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "fmla v23.4s, v11.4s, v5.s[0]\n"
+                        "fmla v27.4s, v11.4s, v6.s[0]\n"
+                        "fmla v31.4s, v11.4s, v7.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v4.s[1]\n"
+                        "fmla v20.4s, v12.4s, v5.s[1]\n"
+                        "fmla v24.4s, v12.4s, v6.s[1]\n"
+                        "fmla v28.4s, v12.4s, v7.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v4.s[1]\n"
+                        "fmla v21.4s, v13.4s, v5.s[1]\n"
+                        "fmla v25.4s, v13.4s, v6.s[1]\n"
+                        "fmla v29.4s, v13.4s, v7.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v4.s[1]\n"
+                        "fmla v22.4s, v14.4s, v5.s[1]\n"
+                        "fmla v26.4s, v14.4s, v6.s[1]\n"
+                        "fmla v30.4s, v14.4s, v7.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v4.s[1]\n"
+                        "fmla v23.4s, v15.4s, v5.s[1]\n"
+                        "fmla v27.4s, v15.4s, v6.s[1]\n"
+                        "fmla v31.4s, v15.4s, v7.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "fmla v20.4s, v8.4s, v5.s[2]\n"
+                        "fmla v24.4s, v8.4s, v6.s[2]\n"
+                        "fmla v28.4s, v8.4s, v7.s[2]\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "fmla v21.4s, v9.4s, v5.s[2]\n"
+                        "fmla v25.4s, v9.4s, v6.s[2]\n"
+                        "fmla v29.4s, v9.4s, v7.s[2]\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "fmla v22.4s, v10.4s, v5.s[2]\n"
+                        "fmla v26.4s, v10.4s, v6.s[2]\n"
+                        "fmla v30.4s, v10.4s, v7.s[2]\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "fmla v23.4s, v11.4s, v5.s[2]\n"
+                        "fmla v27.4s, v11.4s, v6.s[2]\n"
+                        "fmla v31.4s, v11.4s, v7.s[2]\n"
+                        "fmla v16.4s, v12.4s, v4.s[3]\n"
+                        "fmla v20.4s, v12.4s, v5.s[3]\n"
+                        "fmla v24.4s, v12.4s, v6.s[3]\n"
+                        "fmla v28.4s, v12.4s, v7.s[3]\n"
+                        "fmla v17.4s, v13.4s, v4.s[3]\n"
+                        "fmla v21.4s, v13.4s, v5.s[3]\n"
+                        "fmla v25.4s, v13.4s, v6.s[3]\n"
+                        "fmla v29.4s, v13.4s, v7.s[3]\n"
+                        "fmla v18.4s, v14.4s, v4.s[3]\n"
+                        "fmla v22.4s, v14.4s, v5.s[3]\n"
+                        "fmla v26.4s, v14.4s, v6.s[3]\n"
+                        "fmla v30.4s, v14.4s, v7.s[3]\n"
+                        "fmla v19.4s, v15.4s, v4.s[3]\n"
+                        "fmla v23.4s, v15.4s, v5.s[3]\n"
+                        "fmla v27.4s, v15.4s, v6.s[3]\n"
+                        "fmla v31.4s, v15.4s, v7.s[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "fmla v28.4s, v8.4s, v3.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "fmla v29.4s, v9.4s, v3.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "fmla v30.4s, v10.4s, v3.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "fmla v31.4s, v11.4s, v3.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v12.4s, v0.s[1]\n"
+                        "fmla v20.4s, v12.4s, v1.s[1]\n"
+                        "fmla v24.4s, v12.4s, v2.s[1]\n"
+                        "fmla v28.4s, v12.4s, v3.s[1]\n"
+                        "ldr q12, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v13.4s, v0.s[1]\n"
+                        "fmla v21.4s, v13.4s, v1.s[1]\n"
+                        "fmla v25.4s, v13.4s, v2.s[1]\n"
+                        "fmla v29.4s, v13.4s, v3.s[1]\n"
+                        "ldr q13, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v14.4s, v0.s[1]\n"
+                        "fmla v22.4s, v14.4s, v1.s[1]\n"
+                        "fmla v26.4s, v14.4s, v2.s[1]\n"
+                        "fmla v30.4s, v14.4s, v3.s[1]\n"
+                        "ldr q14, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v15.4s, v0.s[1]\n"
+                        "fmla v23.4s, v15.4s, v1.s[1]\n"
+                        "fmla v27.4s, v15.4s, v2.s[1]\n"
+                        "fmla v31.4s, v15.4s, v3.s[1]\n"
+                        "ldr q15, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "fmla v24.4s, v8.4s, v2.s[2]\n"
+                        "fmla v28.4s, v8.4s, v3.s[2]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "fmla v25.4s, v9.4s, v2.s[2]\n"
+                        "fmla v29.4s, v9.4s, v3.s[2]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "fmla v26.4s, v10.4s, v2.s[2]\n"
+                        "fmla v30.4s, v10.4s, v3.s[2]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "fmla v27.4s, v11.4s, v2.s[2]\n"
+                        "fmla v31.4s, v11.4s, v3.s[2]\n"
+                        "fmla v16.4s, v12.4s, v0.s[3]\n"
+                        "fmla v20.4s, v12.4s, v1.s[3]\n"
+                        "fmla v24.4s, v12.4s, v2.s[3]\n"
+                        "fmla v28.4s, v12.4s, v3.s[3]\n"
+                        "fmla v17.4s, v13.4s, v0.s[3]\n"
+                        "fmla v21.4s, v13.4s, v1.s[3]\n"
+                        "fmla v25.4s, v13.4s, v2.s[3]\n"
+                        "fmla v29.4s, v13.4s, v3.s[3]\n"
+                        "fmla v18.4s, v14.4s, v0.s[3]\n"
+                        "fmla v22.4s, v14.4s, v1.s[3]\n"
+                        "fmla v26.4s, v14.4s, v2.s[3]\n"
+                        "fmla v30.4s, v14.4s, v3.s[3]\n"
+                        "fmla v19.4s, v15.4s, v0.s[3]\n"
+                        "fmla v23.4s, v15.4s, v1.s[3]\n"
+                        "fmla v27.4s, v15.4s, v2.s[3]\n"
+                        "fmla v31.4s, v15.4s, v3.s[3]\n"
+                        "5:\n"
+                        "cbz %[blocks], 6f\n"
+                        "7:\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr s0, [%[a_ptr0]]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr s1, [a_ptr1]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x4\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr s2, [a_ptr2]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "add a_ptr2, a_ptr2, #0x4\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr s3, [a_ptr3]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "add a_ptr3, a_ptr3, #0x4\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "fmla v28.4s, v8.4s, v3.s[0]\n"
+                        "fmla v29.4s, v9.4s, v3.s[0]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "fmla v30.4s, v10.4s, v3.s[0]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "fmla v31.4s, v11.4s, v3.s[0]\n"
+                        "b.ne 7b\n"
+                        "6:\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        "str q20, [c_ptr1]\n"
+                        "str q21, [c_ptr1, #0x10]\n"
+                        "str q22, [c_ptr1, #0x20]\n"
+                        "str q23, [c_ptr1, #0x30]\n"
+                        "str q24, [c_ptr2]\n"
+                        "str q25, [c_ptr2, #0x10]\n"
+                        "str q26, [c_ptr2, #0x20]\n"
+                        "str q27, [c_ptr2, #0x30]\n"
+                        "str q28, [c_ptr3]\n"
+                        "str q29, [c_ptr3, #0x10]\n"
+                        "str q30, [c_ptr3, #0x20]\n"
+                        "str q31, [c_ptr3, #0x30]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq a_ptr3\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        ".unreq c_ptr3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+                    );
+                    break;
+            }
+            if (use_result_buffer) {
+                for(int cy=0; cy<std::min(M-y, 4); cy++) {
+                    for(unsigned int cx=0; cx<width; cx++) {
+                        c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
+                    }
+                }
+            }
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
index 48bf842..17f6e57 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
@@ -37,2235 +37,2432 @@
     const long loops_count = ((K + 16) / 32) - 1;
     K -= loops_count * 32;
     const long regs_count = (K / 16) - 1;
+    K -= (regs_count + 1) * 16;
+    const long blocks_count = K / 4;
+    const long odds_count = K - (blocks_count * 4);
 
     for (int y=0; y<M; y+=4) {
         const int8_t * const a_ptr0_base = A + (y * lda);
         const unsigned long ldab = lda * sizeof(int8_t);
 
         int32_t *c_ptr0 = C + (y * ldc);
-        const unsigned long ldcb = ldc * sizeof(int32_t);
 
         for (int x0=0; x0<N; x0+=16ul) {
             const long width = std::min((unsigned long)N-x0, 16ul);
             const int32_t *betaptr = &beta;
             long loops = loops_count;
             long regs = regs_count;
+            long blocks = blocks_count;
+            long odds = odds_count;
             const int8_t *a_ptr0 = a_ptr0_base;
             const int8_t *b_ptr0 = B + (K_stride * x0);
+            const bool use_result_buffer = (width < 16);
+            int32_t result_buffer[64];
+            const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(int32_t);
+            int32_t *c_ptr_real = c_ptr0;
+            if (use_result_buffer && !beta0) {
+                for(int cy=0; cy<std::min(M-y, 4); cy++) {
+                    for(unsigned int cx=0; cx<width; cx++) {
+                        result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
+                    }
+                }
+            }
+            if (use_result_buffer) {
+                c_ptr0 = result_buffer;
+            }
 
             switch(M-y) {
                 case 1:
                     __asm __volatile (
-                        "temploadreg0 .req X0\n"
-                        "temploadreg1 .req X1\n"
-                        "temploadreg2 .req X2\n"
-                        "temploadreg3 .req X3\n"
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "ldr d0, [%[a_ptr0], #-0x10]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "ldr d0, [%[a_ptr0], #0x10]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        ".unreq temploadreg0\n"
-                        ".unreq temploadreg1\n"
-                        ".unreq temploadreg2\n"
-                        ".unreq temploadreg3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+                    "temploadreg0 .req X0\n"
+                    "temploadreg1 .req X1\n"
+                    "temploadreg2 .req X2\n"
+                    "temploadreg3 .req X3\n"
+                    "cbz %[beta0], 1f\n"
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "3:\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    "ldr d0, [%[a_ptr0], #-0x10]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ins v0.d[1], temploadreg0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "b.ne 3b\n"
+                    "2:\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    "cbz %[regs], 4f\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    "b 5f\n"
+                    "4:\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    "5:\n"
+                    "cbz %[blocks], 6f\n"
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
+                    "cbz %[odds], 8f\n"
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "8:\n"
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    ".unreq temploadreg0\n"
+                    ".unreq temploadreg1\n"
+                    ".unreq temploadreg2\n"
+                    ".unreq temploadreg3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
                     );
                     break;
                 case 2:
                     __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "temploadreg0 .req X2\n"
-                        "temploadreg1 .req X3\n"
-                        "temploadreg2 .req X4\n"
-                        "temploadreg3 .req X5\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "mul v20.4s, v20.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v21.4s, v21.4s, v15.4s\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mul v22.4s, v22.4s, v15.4s\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "mul v23.4s, v23.4s, v15.4s\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr d0, [%[a_ptr0], #-0x10]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
-                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ldr d1, [a_ptr1, #-0x10]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
-                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr d0, [%[a_ptr0], #0x10]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
-                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ldr d1, [a_ptr1, #0x10]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x18]\n"
-                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq temploadreg0\n"
-                        ".unreq temploadreg1\n"
-                        ".unreq temploadreg2\n"
-                        ".unreq temploadreg3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+                    "a_ptr1 .req X0\n"
+                    "c_ptr1 .req X1\n"
+                    "temploadreg0 .req X2\n"
+                    "temploadreg1 .req X3\n"
+                    "temploadreg2 .req X4\n"
+                    "temploadreg3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "cbz %[beta0], 1f\n"
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v20.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "movi v21.4s, #0\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "movi v22.4s, #0\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "movi v23.4s, #0\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q20, [c_ptr1]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q21, [c_ptr1, #0x10]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q22, [c_ptr1, #0x20]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q23, [c_ptr1, #0x30]\n"
+                    "mul v20.4s, v20.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v21.4s, v21.4s, v15.4s\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "mul v22.4s, v22.4s, v15.4s\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "mul v23.4s, v23.4s, v15.4s\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "3:\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ldr d5, [a_ptr1]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ins v5.d[1], temploadreg1\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "add a_ptr1, a_ptr1, #0x20\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                    "ldr d0, [%[a_ptr0], #-0x10]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                    ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                    "ldr d1, [a_ptr1, #-0x10]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+                    ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    "ins v0.d[1], temploadreg0\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    "ins v1.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "b.ne 3b\n"
+                    "2:\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "prfm PSTL1KEEP, [c_ptr1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    "cbz %[regs], 4f\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr d5, [a_ptr1]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ins v5.d[1], temploadreg1\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                    "b 5f\n"
+                    "4:\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    "5:\n"
+                    "cbz %[blocks], 6f\n"
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr s1, [a_ptr1]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x4\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
+                    "cbz %[odds], 8f\n"
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[0], [a_ptr1], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[1], [a_ptr1], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "ld1 {v1.b}[2], [a_ptr1]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "8:\n"
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    "str q20, [c_ptr1]\n"
+                    "str q21, [c_ptr1, #0x10]\n"
+                    "str q22, [c_ptr1, #0x20]\n"
+                    "str q23, [c_ptr1, #0x30]\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq temploadreg0\n"
+                    ".unreq temploadreg1\n"
+                    ".unreq temploadreg2\n"
+                    ".unreq temploadreg3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
                     );
                     break;
                 case 3:
                     __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "temploadreg0 .req X4\n"
-                        "temploadreg1 .req X5\n"
-                        "temploadreg2 .req X6\n"
-                        "temploadreg3 .req X7\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v24.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "movi v25.4s, #0\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "movi v26.4s, #0\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "movi v27.4s, #0\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "mul v20.4s, v20.4s, v15.4s\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "mul v21.4s, v21.4s, v15.4s\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "mul v22.4s, v22.4s, v15.4s\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "mul v23.4s, v23.4s, v15.4s\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "mul v24.4s, v24.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v25.4s, v25.4s, v15.4s\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mul v26.4s, v26.4s, v15.4s\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "mul v27.4s, v27.4s, v15.4s\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr d0, [%[a_ptr0], #-0x10]\n"
-                        ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
-                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ldr d1, [a_ptr1, #-0x10]\n"
-                        ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
-                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "ldr d2, [a_ptr2, #-0x10]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg2, [a_ptr2, #-0x8]\n"
-                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ins v2.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr d0, [%[a_ptr0], #0x10]\n"
-                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr d1, [a_ptr1, #0x10]\n"
-                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x18]\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr d2, [a_ptr2, #0x10]\n"
-                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x18]\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v2.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "str q24, [c_ptr2]\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq temploadreg0\n"
-                        ".unreq temploadreg1\n"
-                        ".unreq temploadreg2\n"
-                        ".unreq temploadreg3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "c_ptr1 .req X2\n"
+                    "c_ptr2 .req X3\n"
+                    "temploadreg0 .req X4\n"
+                    "temploadreg1 .req X5\n"
+                    "temploadreg2 .req X6\n"
+                    "temploadreg3 .req X7\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "cbz %[beta0], 1f\n"
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v20.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v21.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "movi v22.4s, #0\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "movi v23.4s, #0\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "movi v24.4s, #0\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "movi v25.4s, #0\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "movi v26.4s, #0\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "movi v27.4s, #0\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q20, [c_ptr1]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q21, [c_ptr1, #0x10]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q22, [c_ptr1, #0x20]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q23, [c_ptr1, #0x30]\n"
+                    "mul v20.4s, v20.4s, v15.4s\n"
+                    "ldr q24, [c_ptr2]\n"
+                    "mul v21.4s, v21.4s, v15.4s\n"
+                    "ldr q25, [c_ptr2, #0x10]\n"
+                    "mul v22.4s, v22.4s, v15.4s\n"
+                    "ldr q26, [c_ptr2, #0x20]\n"
+                    "mul v23.4s, v23.4s, v15.4s\n"
+                    "ldr q27, [c_ptr2, #0x30]\n"
+                    "mul v24.4s, v24.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v25.4s, v25.4s, v15.4s\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "mul v26.4s, v26.4s, v15.4s\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "mul v27.4s, v27.4s, v15.4s\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "cbz %[loops], 2f\n"
+                    "3:\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr d5, [a_ptr1]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr d6, [a_ptr2]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ins v5.d[1], temploadreg1\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ins v6.d[1], temploadreg2\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    "add a_ptr1, a_ptr1, #0x20\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                    "ldr d0, [%[a_ptr0], #-0x10]\n"
+                    ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                    ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                    "ldr d1, [a_ptr1, #-0x10]\n"
+                    ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+                    ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ins v0.d[1], temploadreg0\n"
+                    ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ins v1.d[1], temploadreg1\n"
+                    ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "add a_ptr2, a_ptr2, #0x20\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                    "ldr d2, [a_ptr2, #-0x10]\n"
+                    "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+                    ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    "ins v2.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "b.ne 3b\n"
+                    "2:\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    "prfm PSTL1KEEP, [c_ptr1]\n"
+                    "prfm PSTL1KEEP, [c_ptr2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    "cbz %[regs], 4f\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr d5, [a_ptr1]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr d6, [a_ptr2]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ins v5.d[1], temploadreg1\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ins v6.d[1], temploadreg2\n"
+                    ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+                    "b 5f\n"
+                    "4:\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                    "5:\n"
+                    "cbz %[blocks], 6f\n"
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr s1, [a_ptr1]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x4\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr s2, [a_ptr2]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x4\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
+                    "cbz %[odds], 8f\n"
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[0], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[0], [a_ptr2], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[1], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[1], [a_ptr2], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "ld1 {v1.b}[2], [a_ptr1]\n"
+                    "ld1 {v2.b}[2], [a_ptr2]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    "8:\n"
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    "str q20, [c_ptr1]\n"
+                    "str q21, [c_ptr1, #0x10]\n"
+                    "str q22, [c_ptr1, #0x20]\n"
+                    "str q23, [c_ptr1, #0x30]\n"
+                    "str q24, [c_ptr2]\n"
+                    "str q25, [c_ptr2, #0x10]\n"
+                    "str q26, [c_ptr2, #0x20]\n"
+                    "str q27, [c_ptr2, #0x30]\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq temploadreg0\n"
+                    ".unreq temploadreg1\n"
+                    ".unreq temploadreg2\n"
+                    ".unreq temploadreg3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
                     );
                     break;
                 default:
                 case 4:
                     __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "temploadreg0 .req X6\n"
-                        "temploadreg1 .req X7\n"
-                        "temploadreg2 .req X8\n"
-                        "temploadreg3 .req X9\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v24.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v25.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "movi v26.4s, #0\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "movi v27.4s, #0\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "movi v28.4s, #0\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "movi v29.4s, #0\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "movi v30.4s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "movi v31.4s, #0\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "mul v20.4s, v20.4s, v15.4s\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "mul v21.4s, v21.4s, v15.4s\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "mul v22.4s, v22.4s, v15.4s\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "mul v23.4s, v23.4s, v15.4s\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "mul v24.4s, v24.4s, v15.4s\n"
-                        "ldr q28, [c_ptr3]\n"
-                        "mul v25.4s, v25.4s, v15.4s\n"
-                        "ldr q29, [c_ptr3, #0x10]\n"
-                        "mul v26.4s, v26.4s, v15.4s\n"
-                        "ldr q30, [c_ptr3, #0x20]\n"
-                        "mul v27.4s, v27.4s, v15.4s\n"
-                        "ldr q31, [c_ptr3, #0x30]\n"
-                        "mul v28.4s, v28.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v29.4s, v29.4s, v15.4s\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mul v30.4s, v30.4s, v15.4s\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "mul v31.4s, v31.4s, v15.4s\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d7, [a_ptr3]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr temploadreg3, [a_ptr3, #0x8]\n"
-                        ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ins v7.d[1], temploadreg3\n"
-                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr d0, [%[a_ptr0], #-0x10]\n"
-                        ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ldr d1, [a_ptr1, #-0x10]\n"
-                        ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr d2, [a_ptr2, #-0x10]\n"
-                        ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
-                        "ldr temploadreg2, [a_ptr2, #-0x8]\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ins v2.d[1], temploadreg2\n"
-                        ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
-                        "ldr d3, [a_ptr3, #-0x10]\n"
-                        ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
-                        "ldr temploadreg3, [a_ptr3, #-0x8]\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
-                        "ins v3.d[1], temploadreg3\n"
-                        ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "prfm PSTL1KEEP, [c_ptr3]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr d7, [a_ptr3]\n"
-                        ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr temploadreg3, [a_ptr3, #0x8]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ins v7.d[1], temploadreg3\n"
-                        ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr d0, [%[a_ptr0], #0x10]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr d1, [a_ptr1, #0x10]\n"
-                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x18]\n"
-                        ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr d2, [a_ptr2, #0x10]\n"
-                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x18]\n"
-                        ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
-                        "ldr d3, [a_ptr3, #0x10]\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr temploadreg3, [a_ptr3, #0x18]\n"
-                        ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ins v2.d[1], temploadreg2\n"
-                        ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ins v3.d[1], temploadreg3\n"
-                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
-                        ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
-                        ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
-                        ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
-                        ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
-                        ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
-                        ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
-                        ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr d7, [a_ptr3]\n"
-                        ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr temploadreg3, [a_ptr3, #0x8]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ins v7.d[1], temploadreg3\n"
-                        ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "str q24, [c_ptr2]\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        "str q28, [c_ptr3]\n"
-                        "str q29, [c_ptr3, #0x10]\n"
-                        "str q30, [c_ptr3, #0x20]\n"
-                        "str q31, [c_ptr3, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        ".unreq temploadreg0\n"
-                        ".unreq temploadreg1\n"
-                        ".unreq temploadreg2\n"
-                        ".unreq temploadreg3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "temploadreg0 .req X6\n"
+                    "temploadreg1 .req X7\n"
+                    "temploadreg2 .req X8\n"
+                    "temploadreg3 .req X9\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "cbz %[beta0], 1f\n"
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q3, [a_ptr3]\n"
+                    "movi v20.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v21.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v22.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "movi v23.4s, #0\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "movi v24.4s, #0\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "movi v25.4s, #0\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "movi v26.4s, #0\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "movi v27.4s, #0\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "movi v28.4s, #0\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "movi v29.4s, #0\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "movi v30.4s, #0\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "movi v31.4s, #0\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "add a_ptr3, a_ptr3, #0x10\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q20, [c_ptr1]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q21, [c_ptr1, #0x10]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q22, [c_ptr1, #0x20]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q23, [c_ptr1, #0x30]\n"
+                    "mul v20.4s, v20.4s, v15.4s\n"
+                    "ldr q24, [c_ptr2]\n"
+                    "mul v21.4s, v21.4s, v15.4s\n"
+                    "ldr q25, [c_ptr2, #0x10]\n"
+                    "mul v22.4s, v22.4s, v15.4s\n"
+                    "ldr q26, [c_ptr2, #0x20]\n"
+                    "mul v23.4s, v23.4s, v15.4s\n"
+                    "ldr q27, [c_ptr2, #0x30]\n"
+                    "mul v24.4s, v24.4s, v15.4s\n"
+                    "ldr q28, [c_ptr3]\n"
+                    "mul v25.4s, v25.4s, v15.4s\n"
+                    "ldr q29, [c_ptr3, #0x10]\n"
+                    "mul v26.4s, v26.4s, v15.4s\n"
+                    "ldr q30, [c_ptr3, #0x20]\n"
+                    "mul v27.4s, v27.4s, v15.4s\n"
+                    "ldr q31, [c_ptr3, #0x30]\n"
+                    "mul v28.4s, v28.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v29.4s, v29.4s, v15.4s\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "mul v30.4s, v30.4s, v15.4s\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "mul v31.4s, v31.4s, v15.4s\n"
+                    "ldr q3, [a_ptr3]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add a_ptr3, a_ptr3, #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "cbz %[loops], 2f\n"
+                    "3:\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr d5, [a_ptr1]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr d6, [a_ptr2]\n"
+                    ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+                    "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr d7, [a_ptr3]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr temploadreg3, [a_ptr3, #0x8]\n"
+                    ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+                    "ins v5.d[1], temploadreg1\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ins v6.d[1], temploadreg2\n"
+                    ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ins v7.d[1], temploadreg3\n"
+                    ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+                    "ldr d0, [%[a_ptr0], #-0x10]\n"
+                    ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+                    "ins v0.d[1], temploadreg0\n"
+                    ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x20\n"
+                    ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+                    "ldr d1, [a_ptr1, #-0x10]\n"
+                    ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
+                    "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+                    "ins v1.d[1], temploadreg1\n"
+                    ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                    "add a_ptr2, a_ptr2, #0x20\n"
+                    ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+                    "ldr d2, [a_ptr2, #-0x10]\n"
+                    ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
+                    "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+                    "ins v2.d[1], temploadreg2\n"
+                    ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                    "add a_ptr3, a_ptr3, #0x20\n"
+                    ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+                    "ldr d3, [a_ptr3, #-0x10]\n"
+                    ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
+                    "ldr temploadreg3, [a_ptr3, #-0x8]\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                    "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                    ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+                    "ins v3.d[1], temploadreg3\n"
+                    ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                    ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "b.ne 3b\n"
+                    "2:\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    "prfm PSTL1KEEP, [c_ptr1]\n"
+                    "prfm PSTL1KEEP, [c_ptr2]\n"
+                    "prfm PSTL1KEEP, [c_ptr3]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    "cbz %[regs], 4f\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr d5, [a_ptr1]\n"
+                    ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+                    "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr d6, [a_ptr2]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr d7, [a_ptr3]\n"
+                    ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+                    "ldr temploadreg3, [a_ptr3, #0x8]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ins v5.d[1], temploadreg1\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+                    "ins v6.d[1], temploadreg2\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ins v7.d[1], temploadreg3\n"
+                    ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                    "add a_ptr3, a_ptr3, #0x10\n"
+                    ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                    ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                    ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+                    ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                    ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+                    ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                    ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+                    ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                    ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+                    ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+                    ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+                    ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+                    ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+                    ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
+                    "b 5f\n"
+                    "4:\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                    ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                    ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                    ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                    ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                    ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                    ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                    ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                    ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+                    "5:\n"
+                    "cbz %[blocks], 6f\n"
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr s1, [a_ptr1]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x4\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr s2, [a_ptr2]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x4\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr s3, [a_ptr3]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    "add a_ptr3, a_ptr3, #0x4\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+                    ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
+                    "cbz %[odds], 8f\n"
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[0], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[0], [a_ptr2], #1\n"
+                    "ld1 {v3.b}[0], [a_ptr3], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[1], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[1], [a_ptr2], #1\n"
+                    "ld1 {v3.b}[1], [a_ptr3], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "ld1 {v1.b}[2], [a_ptr1]\n"
+                    "ld1 {v2.b}[2], [a_ptr2]\n"
+                    "ld1 {v3.b}[2], [a_ptr3]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+                    "8:\n"
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    "str q20, [c_ptr1]\n"
+                    "str q21, [c_ptr1, #0x10]\n"
+                    "str q22, [c_ptr1, #0x20]\n"
+                    "str q23, [c_ptr1, #0x30]\n"
+                    "str q24, [c_ptr2]\n"
+                    "str q25, [c_ptr2, #0x10]\n"
+                    "str q26, [c_ptr2, #0x20]\n"
+                    "str q27, [c_ptr2, #0x30]\n"
+                    "str q28, [c_ptr3]\n"
+                    "str q29, [c_ptr3, #0x10]\n"
+                    "str q30, [c_ptr3, #0x20]\n"
+                    "str q31, [c_ptr3, #0x30]\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    ".unreq temploadreg0\n"
+                    ".unreq temploadreg1\n"
+                    ".unreq temploadreg2\n"
+                    ".unreq temploadreg3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
                     );
                     break;
             }
+            if (use_result_buffer) {
+                for(int cy=0; cy<std::min(M-y, 4); cy++) {
+                    for(unsigned int cx=0; cx<width; cx++) {
+                        c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
+                    }
+                }
+            }
         }
     }
 }
 
 } // namespace arm_gemm
 
-#endif // __aarch64__
+#endif // __aarch64__
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
index 0179139..fdd45a0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
@@ -37,1569 +37,1806 @@
     const long loops_count = ((K + 16) / 32) - 1;
     K -= loops_count * 32;
     const long regs_count = (K / 16) - 1;
+    K -= (regs_count + 1) * 16;
+    const long blocks_count = K / 4;
+    const long odds_count = K - (blocks_count * 4);
 
     for (int y=0; y<M; y+=4) {
         const int8_t * const a_ptr0_base = A + (y * lda);
         const unsigned long ldab = lda * sizeof(int8_t);
 
         int32_t *c_ptr0 = C + (y * ldc);
-        const unsigned long ldcb = ldc * sizeof(int32_t);
 
         for (int x0=0; x0<N; x0+=16ul) {
             const long width = std::min((unsigned long)N-x0, 16ul);
             const int32_t *betaptr = &beta;
             long loops = loops_count;
             long regs = regs_count;
+            long blocks = blocks_count;
+            long odds = odds_count;
             const int8_t *a_ptr0 = a_ptr0_base;
             const int8_t *b_ptr0 = B + (K_stride * x0);
+            const bool use_result_buffer = (width < 16);
+            int32_t result_buffer[64];
+            const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(int32_t);
+            int32_t *c_ptr_real = c_ptr0;
+            if (use_result_buffer && !beta0) {
+                for(int cy=0; cy<std::min(M-y, 4); cy++) {
+                    for(unsigned int cx=0; cx<width; cx++) {
+                        result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
+                    }
+                }
+            }
+            if (use_result_buffer) {
+                c_ptr0 = result_buffer;
+            }
 
             switch(M-y) {
                 case 1:
                     __asm __volatile (
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr q0, [%[a_ptr0], #0x10]\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+                    "cbz %[beta0], 1f\n"
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "3:\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr q0, [%[a_ptr0], #-0x10]\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    "b.ne 3b\n"
+                    "2:\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "cbz %[regs], 4f\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    "b 5f\n"
+                    "4:\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    "5:\n"
+                    "cbz %[blocks], 6f\n"
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
+                    "cbz %[odds], 8f\n"
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "8:\n"
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
                     );
                     break;
                 case 2:
                     __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "mul v20.4s, v20.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v21.4s, v21.4s, v15.4s\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mul v22.4s, v22.4s, v15.4s\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "mul v23.4s, v23.4s, v15.4s\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q0, [%[a_ptr0], #0x10]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q1, [a_ptr1, #0x10]\n"
-                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
+                    "a_ptr1 .req X0\n"
+                    "c_ptr1 .req X1\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "cbz %[beta0], 1f\n"
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v20.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "movi v21.4s, #0\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "movi v22.4s, #0\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "movi v23.4s, #0\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q20, [c_ptr1]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q21, [c_ptr1, #0x10]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q22, [c_ptr1, #0x20]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q23, [c_ptr1, #0x30]\n"
+                    "mul v20.4s, v20.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v21.4s, v21.4s, v15.4s\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "mul v22.4s, v22.4s, v15.4s\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "mul v23.4s, v23.4s, v15.4s\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "3:\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q5, [a_ptr1]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    "add a_ptr1, a_ptr1, #0x20\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr q0, [%[a_ptr0], #-0x10]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr q1, [a_ptr1, #-0x10]\n"
+                    ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                    "b.ne 3b\n"
+                    "2:\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "prfm PSTL1KEEP, [c_ptr1]\n"
+                    "cbz %[regs], 4f\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr q5, [a_ptr1]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                    "b 5f\n"
+                    "4:\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    "5:\n"
+                    "cbz %[blocks], 6f\n"
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr s1, [a_ptr1]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x4\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
+                    "cbz %[odds], 8f\n"
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[0], [a_ptr1], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[1], [a_ptr1], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "ld1 {v1.b}[2], [a_ptr1]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "8:\n"
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    "str q20, [c_ptr1]\n"
+                    "str q21, [c_ptr1, #0x10]\n"
+                    "str q22, [c_ptr1, #0x20]\n"
+                    "str q23, [c_ptr1, #0x30]\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq c_ptr1\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
                     );
                     break;
                 case 3:
                     __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v24.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "movi v25.4s, #0\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "movi v26.4s, #0\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "movi v27.4s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "mul v20.4s, v20.4s, v15.4s\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "mul v21.4s, v21.4s, v15.4s\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "mul v22.4s, v22.4s, v15.4s\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "mul v23.4s, v23.4s, v15.4s\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "mul v24.4s, v24.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v25.4s, v25.4s, v15.4s\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mul v26.4s, v26.4s, v15.4s\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "mul v27.4s, v27.4s, v15.4s\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q2, [a_ptr2, #-0x10]\n"
-                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q0, [%[a_ptr0], #0x10]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q1, [a_ptr1, #0x10]\n"
-                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q2, [a_ptr2, #0x10]\n"
-                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "str q24, [c_ptr2]\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "c_ptr1 .req X2\n"
+                    "c_ptr2 .req X3\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "cbz %[beta0], 1f\n"
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v20.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v21.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "movi v22.4s, #0\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "movi v23.4s, #0\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "movi v24.4s, #0\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "movi v25.4s, #0\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "movi v26.4s, #0\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "movi v27.4s, #0\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q20, [c_ptr1]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q21, [c_ptr1, #0x10]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q22, [c_ptr1, #0x20]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q23, [c_ptr1, #0x30]\n"
+                    "mul v20.4s, v20.4s, v15.4s\n"
+                    "ldr q24, [c_ptr2]\n"
+                    "mul v21.4s, v21.4s, v15.4s\n"
+                    "ldr q25, [c_ptr2, #0x10]\n"
+                    "mul v22.4s, v22.4s, v15.4s\n"
+                    "ldr q26, [c_ptr2, #0x20]\n"
+                    "mul v23.4s, v23.4s, v15.4s\n"
+                    "ldr q27, [c_ptr2, #0x30]\n"
+                    "mul v24.4s, v24.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v25.4s, v25.4s, v15.4s\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "mul v26.4s, v26.4s, v15.4s\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "mul v27.4s, v27.4s, v15.4s\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "3:\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr q5, [a_ptr1]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q6, [a_ptr2]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x20\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "add a_ptr2, a_ptr2, #0x20\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                    ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr q0, [%[a_ptr0], #-0x10]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ldr q1, [a_ptr1, #-0x10]\n"
+                    ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr q2, [a_ptr2, #-0x10]\n"
+                    ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                    ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                    ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                    ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                    ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                    ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                    ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                    ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                    ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+                    "b.ne 3b\n"
+                    "2:\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "prfm PSTL1KEEP, [c_ptr1]\n"
+                    "prfm PSTL1KEEP, [c_ptr2]\n"
+                    "cbz %[regs], 4f\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr q5, [a_ptr1]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr q6, [a_ptr2]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                    ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                    ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                    ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                    ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                    ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                    ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                    ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                    ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+                    "b 5f\n"
+                    "4:\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                    "5:\n"
+                    "cbz %[blocks], 6f\n"
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr s1, [a_ptr1]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x4\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr s2, [a_ptr2]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x4\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
+                    "cbz %[odds], 8f\n"
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[0], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[0], [a_ptr2], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[1], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[1], [a_ptr2], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "ld1 {v1.b}[2], [a_ptr1]\n"
+                    "ld1 {v2.b}[2], [a_ptr2]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    "8:\n"
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    "str q20, [c_ptr1]\n"
+                    "str q21, [c_ptr1, #0x10]\n"
+                    "str q22, [c_ptr1, #0x20]\n"
+                    "str q23, [c_ptr1, #0x30]\n"
+                    "str q24, [c_ptr2]\n"
+                    "str q25, [c_ptr2, #0x10]\n"
+                    "str q26, [c_ptr2, #0x20]\n"
+                    "str q27, [c_ptr2, #0x30]\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
                     );
                     break;
                 default:
                 case 4:
                     __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v24.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v25.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "movi v26.4s, #0\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "movi v27.4s, #0\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "movi v28.4s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "movi v29.4s, #0\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "movi v30.4s, #0\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "movi v31.4s, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "mul v20.4s, v20.4s, v15.4s\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "mul v21.4s, v21.4s, v15.4s\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "mul v22.4s, v22.4s, v15.4s\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "mul v23.4s, v23.4s, v15.4s\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "mul v24.4s, v24.4s, v15.4s\n"
-                        "ldr q28, [c_ptr3]\n"
-                        "mul v25.4s, v25.4s, v15.4s\n"
-                        "ldr q29, [c_ptr3, #0x10]\n"
-                        "mul v26.4s, v26.4s, v15.4s\n"
-                        "ldr q30, [c_ptr3, #0x20]\n"
-                        "mul v27.4s, v27.4s, v15.4s\n"
-                        "ldr q31, [c_ptr3, #0x30]\n"
-                        "mul v28.4s, v28.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v29.4s, v29.4s, v15.4s\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mul v30.4s, v30.4s, v15.4s\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "mul v31.4s, v31.4s, v15.4s\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q7, [a_ptr3]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
-                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr q2, [a_ptr2, #-0x10]\n"
-                        ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q3, [a_ptr3, #-0x10]\n"
-                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
-                        ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
-                        ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
-                        ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
-                        ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
-                        ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
-                        ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
-                        ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
-                        ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
-                        ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
-                        ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
-                        ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
-                        ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
-                        ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
-                        ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
-                        ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
-                        ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "prfm PSTL1KEEP, [c_ptr3]\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr q7, [a_ptr3]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q0, [%[a_ptr0], #0x10]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q1, [a_ptr1, #0x10]\n"
-                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr q2, [a_ptr2, #0x10]\n"
-                        ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q3, [a_ptr3, #0x10]\n"
-                        ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
-                        ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
-                        ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
-                        ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
-                        ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
-                        ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
-                        ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
-                        ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
-                        ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
-                        ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
-                        ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
-                        ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
-                        ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
-                        ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
-                        ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
-                        ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
-                        ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
-                        ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
-                        ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
-                        ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
-                        ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
-                        ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
-                        ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
-                        ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
-                        ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
-                        ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
-                        ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
-                        ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
-                        ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
-                        ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr q7, [a_ptr3]\n"
-                        ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
-                        ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
-                        ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
-                        ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
-                        ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
-                        ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
-                        ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
-                        ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
-                        ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
-                        ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
-                        ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
-                        ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
-                        ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
-                        ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
-                        ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
-                        ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
-                        ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
-                        ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
-                        ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "str q24, [c_ptr2]\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        "str q28, [c_ptr3]\n"
-                        "str q29, [c_ptr3, #0x10]\n"
-                        "str q30, [c_ptr3, #0x20]\n"
-                        "str q31, [c_ptr3, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "cbz %[beta0], 1f\n"
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q3, [a_ptr3]\n"
+                    "movi v20.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v21.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v22.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "movi v23.4s, #0\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "movi v24.4s, #0\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "movi v25.4s, #0\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "movi v27.4s, #0\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "movi v28.4s, #0\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "movi v29.4s, #0\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "movi v30.4s, #0\n"
+                    "add a_ptr3, a_ptr3, #0x10\n"
+                    "movi v31.4s, #0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q20, [c_ptr1]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q21, [c_ptr1, #0x10]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q22, [c_ptr1, #0x20]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q23, [c_ptr1, #0x30]\n"
+                    "mul v20.4s, v20.4s, v15.4s\n"
+                    "ldr q24, [c_ptr2]\n"
+                    "mul v21.4s, v21.4s, v15.4s\n"
+                    "ldr q25, [c_ptr2, #0x10]\n"
+                    "mul v22.4s, v22.4s, v15.4s\n"
+                    "ldr q26, [c_ptr2, #0x20]\n"
+                    "mul v23.4s, v23.4s, v15.4s\n"
+                    "ldr q27, [c_ptr2, #0x30]\n"
+                    "mul v24.4s, v24.4s, v15.4s\n"
+                    "ldr q28, [c_ptr3]\n"
+                    "mul v25.4s, v25.4s, v15.4s\n"
+                    "ldr q29, [c_ptr3, #0x10]\n"
+                    "mul v26.4s, v26.4s, v15.4s\n"
+                    "ldr q30, [c_ptr3, #0x20]\n"
+                    "mul v27.4s, v27.4s, v15.4s\n"
+                    "ldr q31, [c_ptr3, #0x30]\n"
+                    "mul v28.4s, v28.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v29.4s, v29.4s, v15.4s\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "mul v30.4s, v30.4s, v15.4s\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "mul v31.4s, v31.4s, v15.4s\n"
+                    "ldr q3, [a_ptr3]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add a_ptr3, a_ptr3, #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "3:\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr q5, [a_ptr1]\n"
+                    ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+                    "ldr q6, [a_ptr2]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q7, [a_ptr3]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x20\n"
+                    ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x20\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    "add a_ptr3, a_ptr3, #0x20\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                    ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+                    ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                    ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                    ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                    ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                    ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                    ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                    ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                    ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                    ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                    ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                    ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                    ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr q0, [%[a_ptr0], #-0x10]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ldr q1, [a_ptr1, #-0x10]\n"
+                    ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                    "ldr q2, [a_ptr2, #-0x10]\n"
+                    ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr q3, [a_ptr3, #-0x10]\n"
+                    ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                    ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+                    ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                    ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+                    ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                    ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+                    ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                    ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+                    ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+                    ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+                    ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+                    ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+                    ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                    ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+                    ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                    ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+                    ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                    ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+                    ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                    ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+                    ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+                    ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+                    ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+                    ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+                    ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
+                    "b.ne 3b\n"
+                    "2:\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "prfm PSTL1KEEP, [c_ptr1]\n"
+                    "prfm PSTL1KEEP, [c_ptr2]\n"
+                    "prfm PSTL1KEEP, [c_ptr3]\n"
+                    "cbz %[regs], 4f\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr q5, [a_ptr1]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr q6, [a_ptr2]\n"
+                    ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+                    "ldr q7, [a_ptr3]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    "add a_ptr3, a_ptr3, #0x10\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                    ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                    ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                    ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                    ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                    ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                    ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                    ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                    ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                    ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                    ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                    ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                    ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
+                    ".word 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
+                    ".word 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
+                    ".word 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
+                    ".word 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
+                    ".word 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
+                    ".word 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
+                    ".word 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
+                    ".word 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
+                    ".word 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
+                    ".word 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
+                    ".word 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
+                    ".word 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
+                    ".word 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
+                    ".word 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
+                    ".word 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
+                    ".word 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
+                    ".word 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
+                    ".word 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
+                    ".word 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
+                    ".word 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
+                    ".word 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
+                    ".word 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
+                    ".word 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
+                    ".word 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
+                    ".word 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
+                    ".word 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
+                    ".word 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
+                    ".word 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
+                    ".word 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
+                    ".word 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
+                    ".word 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
+                    ".word 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
+                    ".word 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
+                    ".word 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
+                    ".word 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
+                    ".word 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
+                    ".word 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
+                    ".word 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
+                    ".word 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
+                    ".word 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
+                    ".word 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
+                    "b 5f\n"
+                    "4:\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
+                    ".word 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
+                    ".word 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
+                    ".word 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
+                    ".word 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
+                    ".word 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
+                    ".word 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
+                    ".word 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
+                    ".word 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
+                    ".word 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
+                    ".word 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
+                    ".word 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
+                    ".word 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
+                    ".word 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
+                    ".word 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
+                    ".word 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
+                    ".word 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
+                    ".word 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
+                    ".word 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
+                    ".word 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
+                    ".word 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
+                    ".word 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
+                    "5:\n"
+                    "cbz %[blocks], 6f\n"
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr s1, [a_ptr1]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x4\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr s2, [a_ptr2]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x4\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr s3, [a_ptr3]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    "add a_ptr3, a_ptr3, #0x4\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+                    ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
+                    "cbz %[odds], 8f\n"
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[0], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[0], [a_ptr2], #1\n"
+                    "ld1 {v3.b}[0], [a_ptr3], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[1], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[1], [a_ptr2], #1\n"
+                    "ld1 {v3.b}[1], [a_ptr3], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "ld1 {v1.b}[2], [a_ptr1]\n"
+                    "ld1 {v2.b}[2], [a_ptr2]\n"
+                    "ld1 {v3.b}[2], [a_ptr3]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+                    ".word 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+                    ".word 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+                    ".word 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+                    ".word 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
+                    ".word 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
+                    "8:\n"
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    "str q20, [c_ptr1]\n"
+                    "str q21, [c_ptr1, #0x10]\n"
+                    "str q22, [c_ptr1, #0x20]\n"
+                    "str q23, [c_ptr1, #0x30]\n"
+                    "str q24, [c_ptr2]\n"
+                    "str q25, [c_ptr2, #0x10]\n"
+                    "str q26, [c_ptr2, #0x20]\n"
+                    "str q27, [c_ptr2, #0x30]\n"
+                    "str q28, [c_ptr3]\n"
+                    "str q29, [c_ptr3, #0x10]\n"
+                    "str q30, [c_ptr3, #0x20]\n"
+                    "str q31, [c_ptr3, #0x30]\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
                     );
                     break;
             }
+            if (use_result_buffer) {
+                for(int cy=0; cy<std::min(M-y, 4); cy++) {
+                    for(unsigned int cx=0; cx<width; cx++) {
+                        c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
+                    }
+                }
+            }
         }
     }
 }
 
 } // namespace arm_gemm
 
-#endif // __aarch64__
+#endif // __aarch64__
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
index 230ecdc..487cfa0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
@@ -37,2235 +37,2432 @@
     const long loops_count = ((K + 16) / 32) - 1;
     K -= loops_count * 32;
     const long regs_count = (K / 16) - 1;
+    K -= (regs_count + 1) * 16;
+    const long blocks_count = K / 4;
+    const long odds_count = K - (blocks_count * 4);
 
     for (int y=0; y<M; y+=4) {
         const uint8_t * const a_ptr0_base = A + (y * lda);
         const unsigned long ldab = lda * sizeof(uint8_t);
 
         uint32_t *c_ptr0 = C + (y * ldc);
-        const unsigned long ldcb = ldc * sizeof(uint32_t);
 
         for (int x0=0; x0<N; x0+=16ul) {
             const long width = std::min((unsigned long)N-x0, 16ul);
             const uint32_t *betaptr = &beta;
             long loops = loops_count;
             long regs = regs_count;
+            long blocks = blocks_count;
+            long odds = odds_count;
             const uint8_t *a_ptr0 = a_ptr0_base;
             const uint8_t *b_ptr0 = B + (K_stride * x0);
+            const bool use_result_buffer = (width < 16);
+            uint32_t result_buffer[64];
+            const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(uint32_t);
+            uint32_t *c_ptr_real = c_ptr0;
+            if (use_result_buffer && !beta0) {
+                for(int cy=0; cy<std::min(M-y, 4); cy++) {
+                    for(unsigned int cx=0; cx<width; cx++) {
+                        result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
+                    }
+                }
+            }
+            if (use_result_buffer) {
+                c_ptr0 = result_buffer;
+            }
 
             switch(M-y) {
                 case 1:
                     __asm __volatile (
-                        "temploadreg0 .req X0\n"
-                        "temploadreg1 .req X1\n"
-                        "temploadreg2 .req X2\n"
-                        "temploadreg3 .req X3\n"
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "ldr d0, [%[a_ptr0], #-0x10]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "ldr d0, [%[a_ptr0], #0x10]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        ".unreq temploadreg0\n"
-                        ".unreq temploadreg1\n"
-                        ".unreq temploadreg2\n"
-                        ".unreq temploadreg3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+                    "temploadreg0 .req X0\n"
+                    "temploadreg1 .req X1\n"
+                    "temploadreg2 .req X2\n"
+                    "temploadreg3 .req X3\n"
+                    "cbz %[beta0], 1f\n"
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "3:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    "ldr d0, [%[a_ptr0], #-0x10]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ins v0.d[1], temploadreg0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "b.ne 3b\n"
+                    "2:\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    "cbz %[regs], 4f\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    "b 5f\n"
+                    "4:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    "5:\n"
+                    "cbz %[blocks], 6f\n"
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
+                    "cbz %[odds], 8f\n"
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "8:\n"
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    ".unreq temploadreg0\n"
+                    ".unreq temploadreg1\n"
+                    ".unreq temploadreg2\n"
+                    ".unreq temploadreg3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
                     );
                     break;
                 case 2:
                     __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "temploadreg0 .req X2\n"
-                        "temploadreg1 .req X3\n"
-                        "temploadreg2 .req X4\n"
-                        "temploadreg3 .req X5\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "mul v20.4s, v20.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v21.4s, v21.4s, v15.4s\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mul v22.4s, v22.4s, v15.4s\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "mul v23.4s, v23.4s, v15.4s\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr d0, [%[a_ptr0], #-0x10]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
-                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ldr d1, [a_ptr1, #-0x10]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
-                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr d0, [%[a_ptr0], #0x10]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
-                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ldr d1, [a_ptr1, #0x10]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x18]\n"
-                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq temploadreg0\n"
-                        ".unreq temploadreg1\n"
-                        ".unreq temploadreg2\n"
-                        ".unreq temploadreg3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+                    "a_ptr1 .req X0\n"
+                    "c_ptr1 .req X1\n"
+                    "temploadreg0 .req X2\n"
+                    "temploadreg1 .req X3\n"
+                    "temploadreg2 .req X4\n"
+                    "temploadreg3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "cbz %[beta0], 1f\n"
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v20.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "movi v21.4s, #0\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "movi v22.4s, #0\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "movi v23.4s, #0\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q20, [c_ptr1]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q21, [c_ptr1, #0x10]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q22, [c_ptr1, #0x20]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q23, [c_ptr1, #0x30]\n"
+                    "mul v20.4s, v20.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v21.4s, v21.4s, v15.4s\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "mul v22.4s, v22.4s, v15.4s\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "mul v23.4s, v23.4s, v15.4s\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "3:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ldr d5, [a_ptr1]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ins v5.d[1], temploadreg1\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    "add a_ptr1, a_ptr1, #0x20\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                    "ldr d0, [%[a_ptr0], #-0x10]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                    ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                    "ldr d1, [a_ptr1, #-0x10]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+                    ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    "ins v0.d[1], temploadreg0\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    "ins v1.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "b.ne 3b\n"
+                    "2:\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "prfm PSTL1KEEP, [c_ptr1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    "cbz %[regs], 4f\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr d5, [a_ptr1]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ins v5.d[1], temploadreg1\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                    "b 5f\n"
+                    "4:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    "5:\n"
+                    "cbz %[blocks], 6f\n"
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr s1, [a_ptr1]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x4\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
+                    "cbz %[odds], 8f\n"
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[0], [a_ptr1], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[1], [a_ptr1], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "ld1 {v1.b}[2], [a_ptr1]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "8:\n"
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    "str q20, [c_ptr1]\n"
+                    "str q21, [c_ptr1, #0x10]\n"
+                    "str q22, [c_ptr1, #0x20]\n"
+                    "str q23, [c_ptr1, #0x30]\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq temploadreg0\n"
+                    ".unreq temploadreg1\n"
+                    ".unreq temploadreg2\n"
+                    ".unreq temploadreg3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
                     );
                     break;
                 case 3:
                     __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "temploadreg0 .req X4\n"
-                        "temploadreg1 .req X5\n"
-                        "temploadreg2 .req X6\n"
-                        "temploadreg3 .req X7\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v24.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "movi v25.4s, #0\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "movi v26.4s, #0\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "movi v27.4s, #0\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "mul v20.4s, v20.4s, v15.4s\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "mul v21.4s, v21.4s, v15.4s\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "mul v22.4s, v22.4s, v15.4s\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "mul v23.4s, v23.4s, v15.4s\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "mul v24.4s, v24.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v25.4s, v25.4s, v15.4s\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mul v26.4s, v26.4s, v15.4s\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "mul v27.4s, v27.4s, v15.4s\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr d0, [%[a_ptr0], #-0x10]\n"
-                        ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
-                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ldr d1, [a_ptr1, #-0x10]\n"
-                        ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
-                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "ldr d2, [a_ptr2, #-0x10]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg2, [a_ptr2, #-0x8]\n"
-                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        "ins v2.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr d0, [%[a_ptr0], #0x10]\n"
-                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr d1, [a_ptr1, #0x10]\n"
-                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x18]\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr d2, [a_ptr2, #0x10]\n"
-                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x18]\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        "ins v2.d[1], temploadreg2\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "str q24, [c_ptr2]\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq temploadreg0\n"
-                        ".unreq temploadreg1\n"
-                        ".unreq temploadreg2\n"
-                        ".unreq temploadreg3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "c_ptr1 .req X2\n"
+                    "c_ptr2 .req X3\n"
+                    "temploadreg0 .req X4\n"
+                    "temploadreg1 .req X5\n"
+                    "temploadreg2 .req X6\n"
+                    "temploadreg3 .req X7\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "cbz %[beta0], 1f\n"
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v20.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v21.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "movi v22.4s, #0\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "movi v23.4s, #0\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "movi v24.4s, #0\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "movi v25.4s, #0\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "movi v26.4s, #0\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "movi v27.4s, #0\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q20, [c_ptr1]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q21, [c_ptr1, #0x10]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q22, [c_ptr1, #0x20]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q23, [c_ptr1, #0x30]\n"
+                    "mul v20.4s, v20.4s, v15.4s\n"
+                    "ldr q24, [c_ptr2]\n"
+                    "mul v21.4s, v21.4s, v15.4s\n"
+                    "ldr q25, [c_ptr2, #0x10]\n"
+                    "mul v22.4s, v22.4s, v15.4s\n"
+                    "ldr q26, [c_ptr2, #0x20]\n"
+                    "mul v23.4s, v23.4s, v15.4s\n"
+                    "ldr q27, [c_ptr2, #0x30]\n"
+                    "mul v24.4s, v24.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v25.4s, v25.4s, v15.4s\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "mul v26.4s, v26.4s, v15.4s\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "mul v27.4s, v27.4s, v15.4s\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "cbz %[loops], 2f\n"
+                    "3:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr d5, [a_ptr1]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr d6, [a_ptr2]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ins v5.d[1], temploadreg1\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ins v6.d[1], temploadreg2\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    "add a_ptr1, a_ptr1, #0x20\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                    "ldr d0, [%[a_ptr0], #-0x10]\n"
+                    ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                    ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                    "ldr d1, [a_ptr1, #-0x10]\n"
+                    ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+                    ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ins v0.d[1], temploadreg0\n"
+                    ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ins v1.d[1], temploadreg1\n"
+                    ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    "add a_ptr2, a_ptr2, #0x20\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                    "ldr d2, [a_ptr2, #-0x10]\n"
+                    "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+                    ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    "ins v2.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "b.ne 3b\n"
+                    "2:\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    "prfm PSTL1KEEP, [c_ptr1]\n"
+                    "prfm PSTL1KEEP, [c_ptr2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    "cbz %[regs], 4f\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr d5, [a_ptr1]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr d6, [a_ptr2]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ins v5.d[1], temploadreg1\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ins v6.d[1], temploadreg2\n"
+                    ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+                    "b 5f\n"
+                    "4:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                    "5:\n"
+                    "cbz %[blocks], 6f\n"
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr s1, [a_ptr1]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x4\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr s2, [a_ptr2]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x4\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
+                    "cbz %[odds], 8f\n"
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[0], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[0], [a_ptr2], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[1], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[1], [a_ptr2], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "ld1 {v1.b}[2], [a_ptr1]\n"
+                    "ld1 {v2.b}[2], [a_ptr2]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    "8:\n"
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    "str q20, [c_ptr1]\n"
+                    "str q21, [c_ptr1, #0x10]\n"
+                    "str q22, [c_ptr1, #0x20]\n"
+                    "str q23, [c_ptr1, #0x30]\n"
+                    "str q24, [c_ptr2]\n"
+                    "str q25, [c_ptr2, #0x10]\n"
+                    "str q26, [c_ptr2, #0x20]\n"
+                    "str q27, [c_ptr2, #0x30]\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq temploadreg0\n"
+                    ".unreq temploadreg1\n"
+                    ".unreq temploadreg2\n"
+                    ".unreq temploadreg3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
                     );
                     break;
                 default:
                 case 4:
                     __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "temploadreg0 .req X6\n"
-                        "temploadreg1 .req X7\n"
-                        "temploadreg2 .req X8\n"
-                        "temploadreg3 .req X9\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v24.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v25.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "movi v26.4s, #0\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "movi v27.4s, #0\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "movi v28.4s, #0\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "movi v29.4s, #0\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "movi v30.4s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "movi v31.4s, #0\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "mul v20.4s, v20.4s, v15.4s\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "mul v21.4s, v21.4s, v15.4s\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "mul v22.4s, v22.4s, v15.4s\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "mul v23.4s, v23.4s, v15.4s\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "mul v24.4s, v24.4s, v15.4s\n"
-                        "ldr q28, [c_ptr3]\n"
-                        "mul v25.4s, v25.4s, v15.4s\n"
-                        "ldr q29, [c_ptr3, #0x10]\n"
-                        "mul v26.4s, v26.4s, v15.4s\n"
-                        "ldr q30, [c_ptr3, #0x20]\n"
-                        "mul v27.4s, v27.4s, v15.4s\n"
-                        "ldr q31, [c_ptr3, #0x30]\n"
-                        "mul v28.4s, v28.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v29.4s, v29.4s, v15.4s\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mul v30.4s, v30.4s, v15.4s\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "mul v31.4s, v31.4s, v15.4s\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d7, [a_ptr3]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr temploadreg3, [a_ptr3, #0x8]\n"
-                        ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ins v7.d[1], temploadreg3\n"
-                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr d0, [%[a_ptr0], #-0x10]\n"
-                        ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ldr d1, [a_ptr1, #-0x10]\n"
-                        ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr d2, [a_ptr2, #-0x10]\n"
-                        ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
-                        "ldr temploadreg2, [a_ptr2, #-0x8]\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ins v2.d[1], temploadreg2\n"
-                        ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
-                        "ldr d3, [a_ptr3, #-0x10]\n"
-                        ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
-                        "ldr temploadreg3, [a_ptr3, #-0x8]\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
-                        "ins v3.d[1], temploadreg3\n"
-                        ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "prfm PSTL1KEEP, [c_ptr3]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr d7, [a_ptr3]\n"
-                        ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr temploadreg3, [a_ptr3, #0x8]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ins v7.d[1], temploadreg3\n"
-                        ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr d8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
-                        ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
-                        "ldr d9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
-                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
-                        ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
-                        "ldr d10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
-                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
-                        "ldr d11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
-                        ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
-                        "ldr d12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
-                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
-                        "ldr d13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x18]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
-                        "ldr d14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr d0, [%[a_ptr0], #0x10]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr d1, [a_ptr1, #0x10]\n"
-                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x18]\n"
-                        ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
-                        "ldr d15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr d2, [a_ptr2, #0x10]\n"
-                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x18]\n"
-                        ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
-                        "ldr d3, [a_ptr3, #0x10]\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr temploadreg3, [a_ptr3, #0x18]\n"
-                        ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
-                        "ins v0.d[1], temploadreg0\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ins v1.d[1], temploadreg1\n"
-                        ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ins v2.d[1], temploadreg2\n"
-                        ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ins v3.d[1], temploadreg3\n"
-                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
-                        ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
-                        ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
-                        ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
-                        ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
-                        ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
-                        ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
-                        ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr d4, [%[a_ptr0]]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
-                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr d5, [a_ptr1]\n"
-                        ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr d6, [a_ptr2]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
-                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr d7, [a_ptr3]\n"
-                        ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr temploadreg3, [a_ptr3, #0x8]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr d8, [%[b_ptr0]]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ins v4.d[1], temploadreg0\n"
-                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
-                        ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr d9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ins v5.d[1], temploadreg1\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
-                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr d10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ins v6.d[1], temploadreg2\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr d11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ins v7.d[1], temploadreg3\n"
-                        ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr d12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ins v8.d[1], temploadreg0\n"
-                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
-                        ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr d13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ins v9.d[1], temploadreg1\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
-                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ins v10.d[1], temploadreg2\n"
-                        ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ldr d14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ins v11.d[1], temploadreg3\n"
-                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
-                        ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr d15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ins v12.d[1], temploadreg0\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ins v13.d[1], temploadreg1\n"
-                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ins v14.d[1], temploadreg2\n"
-                        ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
-                        "ins v15.d[1], temploadreg3\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "str q24, [c_ptr2]\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        "str q28, [c_ptr3]\n"
-                        "str q29, [c_ptr3, #0x10]\n"
-                        "str q30, [c_ptr3, #0x20]\n"
-                        "str q31, [c_ptr3, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        ".unreq temploadreg0\n"
-                        ".unreq temploadreg1\n"
-                        ".unreq temploadreg2\n"
-                        ".unreq temploadreg3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "temploadreg0 .req X6\n"
+                    "temploadreg1 .req X7\n"
+                    "temploadreg2 .req X8\n"
+                    "temploadreg3 .req X9\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "cbz %[beta0], 1f\n"
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q3, [a_ptr3]\n"
+                    "movi v20.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v21.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v22.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "movi v23.4s, #0\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "movi v24.4s, #0\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "movi v25.4s, #0\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "movi v26.4s, #0\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "movi v27.4s, #0\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "movi v28.4s, #0\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "movi v29.4s, #0\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "movi v30.4s, #0\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "movi v31.4s, #0\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "add a_ptr3, a_ptr3, #0x10\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q20, [c_ptr1]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q21, [c_ptr1, #0x10]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q22, [c_ptr1, #0x20]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q23, [c_ptr1, #0x30]\n"
+                    "mul v20.4s, v20.4s, v15.4s\n"
+                    "ldr q24, [c_ptr2]\n"
+                    "mul v21.4s, v21.4s, v15.4s\n"
+                    "ldr q25, [c_ptr2, #0x10]\n"
+                    "mul v22.4s, v22.4s, v15.4s\n"
+                    "ldr q26, [c_ptr2, #0x20]\n"
+                    "mul v23.4s, v23.4s, v15.4s\n"
+                    "ldr q27, [c_ptr2, #0x30]\n"
+                    "mul v24.4s, v24.4s, v15.4s\n"
+                    "ldr q28, [c_ptr3]\n"
+                    "mul v25.4s, v25.4s, v15.4s\n"
+                    "ldr q29, [c_ptr3, #0x10]\n"
+                    "mul v26.4s, v26.4s, v15.4s\n"
+                    "ldr q30, [c_ptr3, #0x20]\n"
+                    "mul v27.4s, v27.4s, v15.4s\n"
+                    "ldr q31, [c_ptr3, #0x30]\n"
+                    "mul v28.4s, v28.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v29.4s, v29.4s, v15.4s\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "mul v30.4s, v30.4s, v15.4s\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "mul v31.4s, v31.4s, v15.4s\n"
+                    "ldr q3, [a_ptr3]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add a_ptr3, a_ptr3, #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "cbz %[loops], 2f\n"
+                    "3:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr d5, [a_ptr1]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr d6, [a_ptr2]\n"
+                    ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+                    "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr d7, [a_ptr3]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr temploadreg3, [a_ptr3, #0x8]\n"
+                    ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+                    "ins v5.d[1], temploadreg1\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ins v6.d[1], temploadreg2\n"
+                    ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ins v7.d[1], temploadreg3\n"
+                    ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+                    "ldr d0, [%[a_ptr0], #-0x10]\n"
+                    ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+                    "ins v0.d[1], temploadreg0\n"
+                    ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x20\n"
+                    ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+                    "ldr d1, [a_ptr1, #-0x10]\n"
+                    ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
+                    "ldr temploadreg1, [a_ptr1, #-0x8]\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+                    "ins v1.d[1], temploadreg1\n"
+                    ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                    "add a_ptr2, a_ptr2, #0x20\n"
+                    ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+                    "ldr d2, [a_ptr2, #-0x10]\n"
+                    ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
+                    "ldr temploadreg2, [a_ptr2, #-0x8]\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+                    "ins v2.d[1], temploadreg2\n"
+                    ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                    "add a_ptr3, a_ptr3, #0x20\n"
+                    ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+                    "ldr d3, [a_ptr3, #-0x10]\n"
+                    ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
+                    "ldr temploadreg3, [a_ptr3, #-0x8]\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                    "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                    ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+                    "ins v3.d[1], temploadreg3\n"
+                    ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                    ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    "b.ne 3b\n"
+                    "2:\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    "prfm PSTL1KEEP, [c_ptr1]\n"
+                    "prfm PSTL1KEEP, [c_ptr2]\n"
+                    "prfm PSTL1KEEP, [c_ptr3]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    "cbz %[regs], 4f\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr d4, [%[a_ptr0]]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr d5, [a_ptr1]\n"
+                    ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+                    "ldr temploadreg1, [a_ptr1, #0x8]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr d6, [a_ptr2]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr temploadreg2, [a_ptr2, #0x8]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr d7, [a_ptr3]\n"
+                    ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+                    "ldr temploadreg3, [a_ptr3, #0x8]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ins v4.d[1], temploadreg0\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ins v5.d[1], temploadreg1\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+                    "ins v6.d[1], temploadreg2\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ins v7.d[1], temploadreg3\n"
+                    ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr d8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
+                    ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+                    "ldr d9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
+                    ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
+                    ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+                    "ldr d10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
+                    ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+                    "ldr d11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
+                    ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+                    "ldr d12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
+                    ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+                    "ldr d13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+                    "ldr d14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+                    "ldr d15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                    "add a_ptr3, a_ptr3, #0x10\n"
+                    ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                    ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                    ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+                    ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                    ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+                    ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                    ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+                    ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                    ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+                    ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+                    ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+                    ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+                    ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+                    ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
+                    "b 5f\n"
+                    "4:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
+                    ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+                    "ldr d8, [%[b_ptr0]]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ins v8.d[1], temploadreg0\n"
+                    ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+                    "ldr d9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ins v9.d[1], temploadreg1\n"
+                    ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+                    "ldr d10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ins v10.d[1], temploadreg2\n"
+                    ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+                    "ldr d11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ins v11.d[1], temploadreg3\n"
+                    ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+                    "ldr d12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ins v12.d[1], temploadreg0\n"
+                    ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+                    "ldr d13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ins v13.d[1], temploadreg1\n"
+                    ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+                    "ldr d14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ins v14.d[1], temploadreg2\n"
+                    ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+                    "ldr d15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ins v15.d[1], temploadreg3\n"
+                    ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                    ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                    ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                    ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                    ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                    ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                    ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                    ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                    ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+                    "5:\n"
+                    "cbz %[blocks], 6f\n"
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr s1, [a_ptr1]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x4\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr s2, [a_ptr2]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x4\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr s3, [a_ptr3]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    "add a_ptr3, a_ptr3, #0x4\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+                    ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
+                    "cbz %[odds], 8f\n"
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[0], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[0], [a_ptr2], #1\n"
+                    "ld1 {v3.b}[0], [a_ptr3], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[1], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[1], [a_ptr2], #1\n"
+                    "ld1 {v3.b}[1], [a_ptr3], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "ld1 {v1.b}[2], [a_ptr1]\n"
+                    "ld1 {v2.b}[2], [a_ptr2]\n"
+                    "ld1 {v3.b}[2], [a_ptr3]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+                    "8:\n"
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    "str q20, [c_ptr1]\n"
+                    "str q21, [c_ptr1, #0x10]\n"
+                    "str q22, [c_ptr1, #0x20]\n"
+                    "str q23, [c_ptr1, #0x30]\n"
+                    "str q24, [c_ptr2]\n"
+                    "str q25, [c_ptr2, #0x10]\n"
+                    "str q26, [c_ptr2, #0x20]\n"
+                    "str q27, [c_ptr2, #0x30]\n"
+                    "str q28, [c_ptr3]\n"
+                    "str q29, [c_ptr3, #0x10]\n"
+                    "str q30, [c_ptr3, #0x20]\n"
+                    "str q31, [c_ptr3, #0x30]\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    ".unreq temploadreg0\n"
+                    ".unreq temploadreg1\n"
+                    ".unreq temploadreg2\n"
+                    ".unreq temploadreg3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
                     );
                     break;
             }
+            if (use_result_buffer) {
+                for(int cy=0; cy<std::min(M-y, 4); cy++) {
+                    for(unsigned int cx=0; cx<width; cx++) {
+                        c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
+                    }
+                }
+            }
         }
     }
 }
 
 } // namespace arm_gemm
 
-#endif // __aarch64__
+#endif // __aarch64__
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp
index dbef029..87f46bb 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp
@@ -37,1569 +37,1806 @@
     const long loops_count = ((K + 16) / 32) - 1;
     K -= loops_count * 32;
     const long regs_count = (K / 16) - 1;
+    K -= (regs_count + 1) * 16;
+    const long blocks_count = K / 4;
+    const long odds_count = K - (blocks_count * 4);
 
     for (int y=0; y<M; y+=4) {
         const uint8_t * const a_ptr0_base = A + (y * lda);
         const unsigned long ldab = lda * sizeof(uint8_t);
 
         uint32_t *c_ptr0 = C + (y * ldc);
-        const unsigned long ldcb = ldc * sizeof(uint32_t);
 
         for (int x0=0; x0<N; x0+=16ul) {
             const long width = std::min((unsigned long)N-x0, 16ul);
             const uint32_t *betaptr = &beta;
             long loops = loops_count;
             long regs = regs_count;
+            long blocks = blocks_count;
+            long odds = odds_count;
             const uint8_t *a_ptr0 = a_ptr0_base;
             const uint8_t *b_ptr0 = B + (K_stride * x0);
+            const bool use_result_buffer = (width < 16);
+            uint32_t result_buffer[64];
+            const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(uint32_t);
+            uint32_t *c_ptr_real = c_ptr0;
+            if (use_result_buffer && !beta0) {
+                for(int cy=0; cy<std::min(M-y, 4); cy++) {
+                    for(unsigned int cx=0; cx<width; cx++) {
+                        result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
+                    }
+                }
+            }
+            if (use_result_buffer) {
+                c_ptr0 = result_buffer;
+            }
 
             switch(M-y) {
                 case 1:
                     __asm __volatile (
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        "ldr q0, [%[a_ptr0], #0x10]\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+                    "cbz %[beta0], 1f\n"
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "3:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr q0, [%[a_ptr0], #-0x10]\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    "b.ne 3b\n"
+                    "2:\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "cbz %[regs], 4f\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    "b 5f\n"
+                    "4:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    "5:\n"
+                    "cbz %[blocks], 6f\n"
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
+                    "cbz %[odds], 8f\n"
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "8:\n"
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
                     );
                     break;
                 case 2:
                     __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "mul v20.4s, v20.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v21.4s, v21.4s, v15.4s\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mul v22.4s, v22.4s, v15.4s\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "mul v23.4s, v23.4s, v15.4s\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q0, [%[a_ptr0], #0x10]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q1, [a_ptr1, #0x10]\n"
-                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
+                    "a_ptr1 .req X0\n"
+                    "c_ptr1 .req X1\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "cbz %[beta0], 1f\n"
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v20.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "movi v21.4s, #0\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "movi v22.4s, #0\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "movi v23.4s, #0\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q20, [c_ptr1]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q21, [c_ptr1, #0x10]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q22, [c_ptr1, #0x20]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q23, [c_ptr1, #0x30]\n"
+                    "mul v20.4s, v20.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v21.4s, v21.4s, v15.4s\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "mul v22.4s, v22.4s, v15.4s\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "mul v23.4s, v23.4s, v15.4s\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "3:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q5, [a_ptr1]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    "add a_ptr1, a_ptr1, #0x20\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr q0, [%[a_ptr0], #-0x10]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr q1, [a_ptr1, #-0x10]\n"
+                    ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                    "b.ne 3b\n"
+                    "2:\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "prfm PSTL1KEEP, [c_ptr1]\n"
+                    "cbz %[regs], 4f\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr q5, [a_ptr1]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                    "b 5f\n"
+                    "4:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    "5:\n"
+                    "cbz %[blocks], 6f\n"
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr s1, [a_ptr1]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x4\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
+                    "cbz %[odds], 8f\n"
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[0], [a_ptr1], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[1], [a_ptr1], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "ld1 {v1.b}[2], [a_ptr1]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "8:\n"
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    "str q20, [c_ptr1]\n"
+                    "str q21, [c_ptr1, #0x10]\n"
+                    "str q22, [c_ptr1, #0x20]\n"
+                    "str q23, [c_ptr1, #0x30]\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq c_ptr1\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
                     );
                     break;
                 case 3:
                     __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v24.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "movi v25.4s, #0\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "movi v26.4s, #0\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "movi v27.4s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "mul v20.4s, v20.4s, v15.4s\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "mul v21.4s, v21.4s, v15.4s\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "mul v22.4s, v22.4s, v15.4s\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "mul v23.4s, v23.4s, v15.4s\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "mul v24.4s, v24.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v25.4s, v25.4s, v15.4s\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mul v26.4s, v26.4s, v15.4s\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "mul v27.4s, v27.4s, v15.4s\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q2, [a_ptr2, #-0x10]\n"
-                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q0, [%[a_ptr0], #0x10]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q1, [a_ptr1, #0x10]\n"
-                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q2, [a_ptr2, #0x10]\n"
-                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "str q24, [c_ptr2]\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "c_ptr1 .req X2\n"
+                    "c_ptr2 .req X3\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "cbz %[beta0], 1f\n"
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v20.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v21.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "movi v22.4s, #0\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "movi v23.4s, #0\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "movi v24.4s, #0\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "movi v25.4s, #0\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "movi v26.4s, #0\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "movi v27.4s, #0\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q20, [c_ptr1]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q21, [c_ptr1, #0x10]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q22, [c_ptr1, #0x20]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q23, [c_ptr1, #0x30]\n"
+                    "mul v20.4s, v20.4s, v15.4s\n"
+                    "ldr q24, [c_ptr2]\n"
+                    "mul v21.4s, v21.4s, v15.4s\n"
+                    "ldr q25, [c_ptr2, #0x10]\n"
+                    "mul v22.4s, v22.4s, v15.4s\n"
+                    "ldr q26, [c_ptr2, #0x20]\n"
+                    "mul v23.4s, v23.4s, v15.4s\n"
+                    "ldr q27, [c_ptr2, #0x30]\n"
+                    "mul v24.4s, v24.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v25.4s, v25.4s, v15.4s\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "mul v26.4s, v26.4s, v15.4s\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "mul v27.4s, v27.4s, v15.4s\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "3:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr q5, [a_ptr1]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q6, [a_ptr2]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x20\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "add a_ptr2, a_ptr2, #0x20\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                    ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr q0, [%[a_ptr0], #-0x10]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ldr q1, [a_ptr1, #-0x10]\n"
+                    ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr q2, [a_ptr2, #-0x10]\n"
+                    ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                    ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                    ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                    ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                    ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                    ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                    ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                    ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                    ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+                    "b.ne 3b\n"
+                    "2:\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "prfm PSTL1KEEP, [c_ptr1]\n"
+                    "prfm PSTL1KEEP, [c_ptr2]\n"
+                    "cbz %[regs], 4f\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr q5, [a_ptr1]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr q6, [a_ptr2]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                    ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                    ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                    ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                    ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                    ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                    ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                    ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                    ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+                    "b 5f\n"
+                    "4:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                    "5:\n"
+                    "cbz %[blocks], 6f\n"
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr s1, [a_ptr1]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x4\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr s2, [a_ptr2]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x4\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
+                    "cbz %[odds], 8f\n"
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[0], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[0], [a_ptr2], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[1], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[1], [a_ptr2], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "ld1 {v1.b}[2], [a_ptr1]\n"
+                    "ld1 {v2.b}[2], [a_ptr2]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    "8:\n"
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    "str q20, [c_ptr1]\n"
+                    "str q21, [c_ptr1, #0x10]\n"
+                    "str q22, [c_ptr1, #0x20]\n"
+                    "str q23, [c_ptr1, #0x30]\n"
+                    "str q24, [c_ptr2]\n"
+                    "str q25, [c_ptr2, #0x10]\n"
+                    "str q26, [c_ptr2, #0x20]\n"
+                    "str q27, [c_ptr2, #0x30]\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
                     );
                     break;
                 default:
                 case 4:
                     __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "cbz %[beta0], 1f\n"
-                        "movi v16.4s, #0\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "movi v17.4s, #0\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "movi v18.4s, #0\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "movi v19.4s, #0\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "movi v20.4s, #0\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "movi v21.4s, #0\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "movi v22.4s, #0\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "movi v23.4s, #0\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "movi v24.4s, #0\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "movi v25.4s, #0\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "movi v26.4s, #0\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "movi v27.4s, #0\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "movi v28.4s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "movi v29.4s, #0\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "movi v30.4s, #0\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "movi v31.4s, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "b 3f\n"
-                        "1:\n"
-                        "ld1r {v15.4s}, [%[betaptr]]\n"
-                        "ldr q16, [%[c_ptr0]]\n"
-                        "ldr q17, [%[c_ptr0], #0x10]\n"
-                        "ldr q18, [%[c_ptr0], #0x20]\n"
-                        "ldr q19, [%[c_ptr0], #0x30]\n"
-                        "mul v16.4s, v16.4s, v15.4s\n"
-                        "ldr q20, [c_ptr1]\n"
-                        "mul v17.4s, v17.4s, v15.4s\n"
-                        "ldr q21, [c_ptr1, #0x10]\n"
-                        "mul v18.4s, v18.4s, v15.4s\n"
-                        "ldr q22, [c_ptr1, #0x20]\n"
-                        "mul v19.4s, v19.4s, v15.4s\n"
-                        "ldr q23, [c_ptr1, #0x30]\n"
-                        "mul v20.4s, v20.4s, v15.4s\n"
-                        "ldr q24, [c_ptr2]\n"
-                        "mul v21.4s, v21.4s, v15.4s\n"
-                        "ldr q25, [c_ptr2, #0x10]\n"
-                        "mul v22.4s, v22.4s, v15.4s\n"
-                        "ldr q26, [c_ptr2, #0x20]\n"
-                        "mul v23.4s, v23.4s, v15.4s\n"
-                        "ldr q27, [c_ptr2, #0x30]\n"
-                        "mul v24.4s, v24.4s, v15.4s\n"
-                        "ldr q28, [c_ptr3]\n"
-                        "mul v25.4s, v25.4s, v15.4s\n"
-                        "ldr q29, [c_ptr3, #0x10]\n"
-                        "mul v26.4s, v26.4s, v15.4s\n"
-                        "ldr q30, [c_ptr3, #0x20]\n"
-                        "mul v27.4s, v27.4s, v15.4s\n"
-                        "ldr q31, [c_ptr3, #0x30]\n"
-                        "mul v28.4s, v28.4s, v15.4s\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mul v29.4s, v29.4s, v15.4s\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mul v30.4s, v30.4s, v15.4s\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "mul v31.4s, v31.4s, v15.4s\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
-                        "cbz %[loops], 2f\n"
-                        "3:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q7, [a_ptr3]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
-                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr q2, [a_ptr2, #-0x10]\n"
-                        ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q3, [a_ptr3, #-0x10]\n"
-                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
-                        ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
-                        ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
-                        ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
-                        ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
-                        ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
-                        ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
-                        ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
-                        ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
-                        ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
-                        ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
-                        ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
-                        ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
-                        ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
-                        ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
-                        ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
-                        ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
-                        "b.ne 3b\n"
-                        "2:\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "prfm PSTL1KEEP, [c_ptr3]\n"
-                        "cbz %[regs], 4f\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr q7, [a_ptr3]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
-                        "ldr q8, [%[b_ptr0], #-0x80]\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
-                        "ldr q9, [%[b_ptr0], #-0x70]\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
-                        "ldr q10, [%[b_ptr0], #-0x60]\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
-                        "ldr q11, [%[b_ptr0], #-0x50]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
-                        "ldr q12, [%[b_ptr0], #-0x40]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
-                        "ldr q13, [%[b_ptr0], #-0x30]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
-                        "ldr q14, [%[b_ptr0], #-0x20]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        "ldr q0, [%[a_ptr0], #0x10]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        "ldr q1, [a_ptr1, #0x10]\n"
-                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        "ldr q2, [a_ptr2, #0x10]\n"
-                        ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
-                        "ldr q15, [%[b_ptr0], #-0x10]\n"
-                        ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
-                        "ldr q3, [a_ptr3, #0x10]\n"
-                        ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
-                        ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
-                        ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
-                        ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
-                        ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
-                        ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
-                        ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
-                        ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
-                        ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
-                        ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
-                        ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
-                        ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
-                        ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
-                        ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
-                        ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
-                        ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
-                        ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
-                        ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
-                        ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
-                        ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
-                        ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
-                        ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
-                        ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
-                        ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
-                        ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
-                        ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
-                        ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
-                        ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
-                        ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
-                        ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
-                        ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
-                        ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
-                        ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
-                        ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
-                        ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
-                        ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
-                        ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
-                        ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
-                        ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
-                        ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
-                        ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
-                        ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
-                        ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
-                        "b 5f\n"
-                        "4:\n"
-                        ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
-                        "ldr q7, [a_ptr3]\n"
-                        ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
-                        ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
-                        ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
-                        ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
-                        ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
-                        ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
-                        ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
-                        ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
-                        ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
-                        ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
-                        "ldr q12, [%[b_ptr0], #0x40]\n"
-                        ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
-                        ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x50]\n"
-                        ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
-                        ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x60]\n"
-                        ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
-                        ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
-                        ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
-                        ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x70]\n"
-                        ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
-                        ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
-                        ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
-                        ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
-                        ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
-                        ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
-                        ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
-                        ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
-                        ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
-                        ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
-                        ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
-                        ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
-                        ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
-                        ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
-                        ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
-                        ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
-                        ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
-                        ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
-                        ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
-                        ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
-                        ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
-                        ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
-                        ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
-                        ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
-                        ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
-                        ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
-                        "5:\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "str q24, [c_ptr2]\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        "str q28, [c_ptr3]\n"
-                        "str q29, [c_ptr3, #0x10]\n"
-                        "str q30, [c_ptr3, #0x20]\n"
-                        "str q31, [c_ptr3, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+                    "a_ptr1 .req X0\n"
+                    "a_ptr2 .req X1\n"
+                    "a_ptr3 .req X2\n"
+                    "c_ptr1 .req X3\n"
+                    "c_ptr2 .req X4\n"
+                    "c_ptr3 .req X5\n"
+                    "add a_ptr1, %[a_ptr0], %[lda]\n"
+                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                    "add a_ptr2, a_ptr1, %[lda]\n"
+                    "add c_ptr2, c_ptr1, %[ldc]\n"
+                    "add a_ptr3, a_ptr2, %[lda]\n"
+                    "add c_ptr3, c_ptr2, %[ldc]\n"
+                    "cbz %[beta0], 1f\n"
+                    "movi v16.4s, #0\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "movi v17.4s, #0\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "movi v18.4s, #0\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "movi v19.4s, #0\n"
+                    "ldr q3, [a_ptr3]\n"
+                    "movi v20.4s, #0\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "movi v21.4s, #0\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "movi v22.4s, #0\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "movi v23.4s, #0\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "movi v24.4s, #0\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "movi v25.4s, #0\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "movi v26.4s, #0\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "movi v27.4s, #0\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "movi v28.4s, #0\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "movi v29.4s, #0\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "movi v30.4s, #0\n"
+                    "add a_ptr3, a_ptr3, #0x10\n"
+                    "movi v31.4s, #0\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "b 3f\n"
+                    "1:\n"
+                    "ld1r {v15.4s}, [%[betaptr]]\n"
+                    "ldr q16, [%[c_ptr0]]\n"
+                    "ldr q17, [%[c_ptr0], #0x10]\n"
+                    "ldr q18, [%[c_ptr0], #0x20]\n"
+                    "ldr q19, [%[c_ptr0], #0x30]\n"
+                    "mul v16.4s, v16.4s, v15.4s\n"
+                    "ldr q20, [c_ptr1]\n"
+                    "mul v17.4s, v17.4s, v15.4s\n"
+                    "ldr q21, [c_ptr1, #0x10]\n"
+                    "mul v18.4s, v18.4s, v15.4s\n"
+                    "ldr q22, [c_ptr1, #0x20]\n"
+                    "mul v19.4s, v19.4s, v15.4s\n"
+                    "ldr q23, [c_ptr1, #0x30]\n"
+                    "mul v20.4s, v20.4s, v15.4s\n"
+                    "ldr q24, [c_ptr2]\n"
+                    "mul v21.4s, v21.4s, v15.4s\n"
+                    "ldr q25, [c_ptr2, #0x10]\n"
+                    "mul v22.4s, v22.4s, v15.4s\n"
+                    "ldr q26, [c_ptr2, #0x20]\n"
+                    "mul v23.4s, v23.4s, v15.4s\n"
+                    "ldr q27, [c_ptr2, #0x30]\n"
+                    "mul v24.4s, v24.4s, v15.4s\n"
+                    "ldr q28, [c_ptr3]\n"
+                    "mul v25.4s, v25.4s, v15.4s\n"
+                    "ldr q29, [c_ptr3, #0x10]\n"
+                    "mul v26.4s, v26.4s, v15.4s\n"
+                    "ldr q30, [c_ptr3, #0x20]\n"
+                    "mul v27.4s, v27.4s, v15.4s\n"
+                    "ldr q31, [c_ptr3, #0x30]\n"
+                    "mul v28.4s, v28.4s, v15.4s\n"
+                    "ldr q0, [%[a_ptr0]]\n"
+                    "mul v29.4s, v29.4s, v15.4s\n"
+                    "ldr q1, [a_ptr1]\n"
+                    "mul v30.4s, v30.4s, v15.4s\n"
+                    "ldr q2, [a_ptr2]\n"
+                    "mul v31.4s, v31.4s, v15.4s\n"
+                    "ldr q3, [a_ptr3]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add a_ptr3, a_ptr3, #0x10\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    "cbz %[loops], 2f\n"
+                    "3:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr q5, [a_ptr1]\n"
+                    ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+                    "ldr q6, [a_ptr2]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q7, [a_ptr3]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    "subs %[loops], %[loops], #0x1\n"
+                    ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x20\n"
+                    ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x20\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    "add a_ptr3, a_ptr3, #0x20\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                    ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+                    ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                    ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                    ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                    ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                    ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                    ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                    ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                    ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                    ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                    ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                    ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                    ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    "ldr q0, [%[a_ptr0], #-0x10]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    "ldr q1, [a_ptr1, #-0x10]\n"
+                    ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                    "ldr q2, [a_ptr2, #-0x10]\n"
+                    ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    "ldr q3, [a_ptr3, #-0x10]\n"
+                    ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                    ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+                    ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                    ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+                    ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                    ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+                    ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                    ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+                    ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+                    ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+                    ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+                    ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+                    ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                    ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+                    ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                    ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+                    ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                    ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+                    ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                    ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+                    ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+                    ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+                    ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+                    ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+                    ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
+                    "b.ne 3b\n"
+                    "2:\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                    "prfm PSTL1KEEP, [c_ptr1]\n"
+                    "prfm PSTL1KEEP, [c_ptr2]\n"
+                    "prfm PSTL1KEEP, [c_ptr3]\n"
+                    "cbz %[regs], 4f\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr q4, [%[a_ptr0]]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "ldr q5, [a_ptr1]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    "ldr q6, [a_ptr2]\n"
+                    ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+                    "ldr q7, [a_ptr3]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x10\n"
+                    ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x10\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    "add a_ptr3, a_ptr3, #0x10\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                    ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                    ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                    ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                    ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                    ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+                    "ldr q8, [%[b_ptr0], #-0x80]\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                    ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+                    "ldr q9, [%[b_ptr0], #-0x70]\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                    ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+                    "ldr q10, [%[b_ptr0], #-0x60]\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                    ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+                    "ldr q11, [%[b_ptr0], #-0x50]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                    ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+                    "ldr q12, [%[b_ptr0], #-0x40]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                    ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+                    "ldr q13, [%[b_ptr0], #-0x30]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                    ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+                    "ldr q14, [%[b_ptr0], #-0x20]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                    ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+                    "ldr q15, [%[b_ptr0], #-0x10]\n"
+                    ".word 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
+                    ".word 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
+                    ".word 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
+                    ".word 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
+                    ".word 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
+                    ".word 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
+                    ".word 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
+                    ".word 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
+                    ".word 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
+                    ".word 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
+                    ".word 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
+                    ".word 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
+                    ".word 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
+                    ".word 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
+                    ".word 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
+                    ".word 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
+                    ".word 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
+                    ".word 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
+                    ".word 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
+                    ".word 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
+                    ".word 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
+                    ".word 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
+                    ".word 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
+                    ".word 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
+                    ".word 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
+                    ".word 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
+                    ".word 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
+                    ".word 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
+                    ".word 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
+                    ".word 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
+                    ".word 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
+                    ".word 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
+                    ".word 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
+                    ".word 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
+                    ".word 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
+                    ".word 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
+                    ".word 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
+                    ".word 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
+                    ".word 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
+                    ".word 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
+                    ".word 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
+                    ".word 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
+                    ".word 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
+                    "b 5f\n"
+                    "4:\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
+                    ".word 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
+                    "ldr q12, [%[b_ptr0], #0x40]\n"
+                    ".word 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
+                    ".word 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
+                    "ldr q13, [%[b_ptr0], #0x50]\n"
+                    ".word 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
+                    ".word 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
+                    "ldr q14, [%[b_ptr0], #0x60]\n"
+                    ".word 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
+                    ".word 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
+                    ".word 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
+                    ".word 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
+                    "ldr q15, [%[b_ptr0], #0x70]\n"
+                    ".word 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
+                    ".word 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
+                    ".word 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
+                    ".word 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
+                    ".word 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
+                    ".word 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
+                    ".word 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
+                    ".word 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
+                    ".word 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
+                    ".word 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
+                    ".word 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
+                    ".word 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
+                    ".word 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
+                    ".word 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
+                    ".word 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
+                    ".word 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
+                    ".word 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
+                    ".word 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
+                    ".word 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
+                    ".word 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
+                    ".word 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
+                    ".word 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
+                    ".word 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
+                    ".word 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
+                    ".word 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
+                    ".word 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
+                    "5:\n"
+                    "cbz %[blocks], 6f\n"
+                    "7:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "subs %[blocks], %[blocks], #0x1\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr s0, [%[a_ptr0]]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    "ldr s1, [a_ptr1]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    "add a_ptr1, a_ptr1, #0x4\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    "ldr s2, [a_ptr2]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    "add a_ptr2, a_ptr2, #0x4\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    "ldr s3, [a_ptr3]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    "add a_ptr3, a_ptr3, #0x4\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+                    ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+                    "b.ne 7b\n"
+                    "6:\n"
+                    "cbz %[odds], 8f\n"
+                    "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[0], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[0], [a_ptr2], #1\n"
+                    "ld1 {v3.b}[0], [a_ptr3], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
+                    "ld1 {v1.b}[1], [a_ptr1], #1\n"
+                    "ld1 {v2.b}[1], [a_ptr2], #1\n"
+                    "ld1 {v3.b}[1], [a_ptr3], #1\n"
+                    "subs %[odds], %[odds], #0x1\n"
+                    "b.eq 9f\n"
+                    "ld1 {v0.b}[2], [%[a_ptr0]]\n"
+                    "ld1 {v1.b}[2], [a_ptr1]\n"
+                    "ld1 {v2.b}[2], [a_ptr2]\n"
+                    "ld1 {v3.b}[2], [a_ptr3]\n"
+                    "9:\n"
+                    "ldr q8, [%[b_ptr0]]\n"
+                    "ldr q9, [%[b_ptr0], #0x10]\n"
+                    "ldr q10, [%[b_ptr0], #0x20]\n"
+                    "ldr q11, [%[b_ptr0], #0x30]\n"
+                    ".word 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+                    ".word 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+                    ".word 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+                    ".word 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+                    ".word 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+                    ".word 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+                    ".word 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+                    ".word 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+                    ".word 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+                    ".word 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+                    ".word 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+                    ".word 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+                    ".word 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
+                    ".word 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
+                    ".word 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
+                    ".word 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
+                    "8:\n"
+                    "str q16, [%[c_ptr0]]\n"
+                    "str q17, [%[c_ptr0], #0x10]\n"
+                    "str q18, [%[c_ptr0], #0x20]\n"
+                    "str q19, [%[c_ptr0], #0x30]\n"
+                    "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                    "str q20, [c_ptr1]\n"
+                    "str q21, [c_ptr1, #0x10]\n"
+                    "str q22, [c_ptr1, #0x20]\n"
+                    "str q23, [c_ptr1, #0x30]\n"
+                    "str q24, [c_ptr2]\n"
+                    "str q25, [c_ptr2, #0x10]\n"
+                    "str q26, [c_ptr2, #0x20]\n"
+                    "str q27, [c_ptr2, #0x30]\n"
+                    "str q28, [c_ptr3]\n"
+                    "str q29, [c_ptr3, #0x10]\n"
+                    "str q30, [c_ptr3, #0x20]\n"
+                    "str q31, [c_ptr3, #0x30]\n"
+                    ".unreq a_ptr1\n"
+                    ".unreq a_ptr2\n"
+                    ".unreq a_ptr3\n"
+                    ".unreq c_ptr1\n"
+                    ".unreq c_ptr2\n"
+                    ".unreq c_ptr3\n"
+                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
+                    : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
                     );
                     break;
             }
+            if (use_result_buffer) {
+                for(int cy=0; cy<std::min(M-y, 4); cy++) {
+                    for(unsigned int cx=0; cx<width; cx++) {
+                        c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
+                    }
+                }
+            }
         }
     }
 }
 
 } // namespace arm_gemm
 
-#endif // __aarch64__
+#endif // __aarch64__
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp
new file mode 100644
index 0000000..c6895a6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+#include "../std_transforms_sve.hpp"
+
+namespace arm_gemm
+{
+
+// Generic kernel (defined in sve_hybrid_fp16_mla_4VLx4/generic.cpp); arguments in order: A, lda, B, C, ldc, beta, M, N, K.
+void sve_hybrid_fp16_mla_4VLx4(const __fp16 *, int, const __fp16 *, __fp16 *, int, __fp16, int, int, int);
+
+// Strategy description for the SVE FP16 hybrid MLA kernel: it exposes the element
+// types, the blocking figures and the kernel function pointer as class members.
+class hybrid_fp16_mla_4VLx4
+{
+public:
+    // Element types of the inputs and of the output.
+    using operand_type = __fp16;
+    using result_type  = __fp16;
+
+    // Pointer-to-kernel type; same parameter list as sve_hybrid_fp16_mla_4VLx4.
+    using kern_type = void (*)(const __fp16 *, int, const __fp16 *, __fp16 *, int, __fp16, int, int, int);
+
+    /* Kernel blocking parameters */
+    // Output block height: always 4.
+    static unsigned int out_height() { return 4; }
+
+    // Output block width: four times the __fp16 vector length.
+    static unsigned int out_width() { return 4 * get_vector_length<__fp16>(); }
+
+    // K unroll figure: always 1.
+    static unsigned int k_unroll() { return 1; }
+
+    // Transforms object, instantiated with the 4 / 4 / 1 template arguments.
+    StdTransformsSVE<operand_type, result_type, 4, 4, 1> transforms = {};
+
+    // Kernel to invoke; initialised to the generic implementation.
+    kern_type kernel = sve_hybrid_fp16_mla_4VLx4;
+
+    // The constructor body is empty: `ci` is accepted but not inspected, so
+    // `kernel` keeps its default initialiser.
+    hybrid_fp16_mla_4VLx4(const CPUInfo *ci)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp
new file mode 100644
index 0000000..ab41fb3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp
@@ -0,0 +1,3681 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_hybrid_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, __fp16 *C, int ldc, __fp16 beta, int M, int N, int K) {
+    const long beta0 = (beta == 0.0f);
+    const int K_stride = K;
+    const long loops_count = ((K + 8) / 16) - 1;
+    K -= loops_count * 16;
+    const long regs_count = (K / 8) - 1;
+    K -= (regs_count + 1) * 8;
+    const long leftovers = K;
+
+    for (int y=0; y<M; y+=4) {
+        const __fp16 * const a_ptr0_base = A + (y * lda);
+        const unsigned long ldab = lda * sizeof(__fp16);
+
+        __fp16 *c_ptr0 = C + (y * ldc);
+        const unsigned long ldcb = ldc * sizeof(__fp16);
+
+        for (int x0=0; x0<N; x0+=(4 * get_vector_length<__fp16>())) {
+            const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<__fp16>()));
+            const __fp16 *betaptr = &beta;
+            long loops = loops_count;
+            long regs = regs_count;
+            long temp = 0;
+            long blocks = leftovers;
+            const __fp16 *a_ptr0 = a_ptr0_base;
+            const __fp16 *b_ptr0 = B + (K_stride * x0);
+
+            switch(M-y) {
+                case 1:
+                    __asm __volatile (
+                        "whilelt p6.h, %[temp], %[leftovers]\n"
+                        "whilelt p0.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "ptrue p7.h\n"
+                        "whilelt p1.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p2.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p3.h, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.h, #0\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.h, #0\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "mov z18.h, #0\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z19.h, #0\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+                        "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+                        "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "fmul z16.h, p7/m, z16.h, z15.h\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "fmul z17.h, p7/m, z17.h, z15.h\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmul z18.h, p7/m, z18.h, z15.h\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmul z19.h, p7/m, z19.h, z15.h\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "b.eq 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "b.eq 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "5:\n"
+                        "st1h z16.h, p0, [%[c_ptr0]]\n"
+                        "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                    );
+                    break;
+                case 2:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "c_ptr1 .req X1\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "whilelt p6.h, %[temp], %[leftovers]\n"
+                        "whilelt p0.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "ptrue p7.h\n"
+                        "whilelt p1.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p2.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p3.h, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.h, #0\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.h, #0\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+                        "mov z18.h, #0\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "mov z19.h, #0\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z20.h, #0\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "mov z21.h, #0\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z22.h, #0\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "mov z23.h, #0\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" // slot 7 of this B block (z8..z14 took #0..#6); #-1 here would read one vector before b_ptr0. Same address the loop reads as #-1 after the addvl #8 below.
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+                        "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+                        "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "fmul z16.h, p7/m, z16.h, z15.h\n"
+                        "ld1h z20.h, p0/z, [c_ptr1]\n"
+                        "fmul z17.h, p7/m, z17.h, z15.h\n"
+                        "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "fmul z18.h, p7/m, z18.h, z15.h\n"
+                        "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "fmul z19.h, p7/m, z19.h, z15.h\n"
+                        "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "fmul z20.h, p7/m, z20.h, z15.h\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "fmul z21.h, p7/m, z21.h, z15.h\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+                        "fmul z22.h, p7/m, z22.h, z15.h\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmul z23.h, p7/m, z23.h, z15.h\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n" // z15 no longer holds beta (last fmul is done); load slot 7 of this B block, not #-1, which would read one vector before b_ptr0 ahead of the addvl #8 below.
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "fmla z20.h, z12.h, z5.h[7]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "fmla z21.h, z13.h, z5.h[7]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "fmla z22.h, z14.h, z5.h[7]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "fmla z23.h, z15.h, z5.h[7]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "fmla z20.h, z12.h, z5.h[7]\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "fmla z21.h, z13.h, z5.h[7]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "fmla z22.h, z14.h, z5.h[7]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "fmla z23.h, z15.h, z5.h[7]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "b.eq 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "b.eq 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "5:\n"
+                        "st1h z16.h, p0, [%[c_ptr0]]\n"
+                        "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1h z20.h, p0, [c_ptr1]\n"
+                        "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq c_ptr1\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+                    );
+                    break;
+                case 3:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "c_ptr1 .req X2\n"
+                        "c_ptr2 .req X3\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "whilelt p6.h, %[temp], %[leftovers]\n"
+                        "whilelt p0.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "ptrue p7.h\n"
+                        "whilelt p1.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p2.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p3.h, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.h, #0\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.h, #0\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+                        "mov z18.h, #0\n"
+                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+                        "mov z19.h, #0\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "mov z20.h, #0\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z21.h, #0\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "mov z22.h, #0\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z23.h, #0\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "mov z24.h, #0\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "mov z25.h, #0\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "mov z26.h, #0\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "mov z27.h, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+                        "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+                        "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "fmul z16.h, p7/m, z16.h, z15.h\n"
+                        "ld1h z20.h, p0/z, [c_ptr1]\n"
+                        "fmul z17.h, p7/m, z17.h, z15.h\n"
+                        "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "fmul z18.h, p7/m, z18.h, z15.h\n"
+                        "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "fmul z19.h, p7/m, z19.h, z15.h\n"
+                        "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "fmul z20.h, p7/m, z20.h, z15.h\n"
+                        "ld1h z24.h, p0/z, [c_ptr2]\n"
+                        "fmul z21.h, p7/m, z21.h, z15.h\n"
+                        "ld1h z25.h, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "fmul z22.h, p7/m, z22.h, z15.h\n"
+                        "ld1h z26.h, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "fmul z23.h, p7/m, z23.h, z15.h\n"
+                        "ld1h z27.h, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "fmul z24.h, p7/m, z24.h, z15.h\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "fmul z25.h, p7/m, z25.h, z15.h\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+                        "fmul z26.h, p7/m, z26.h, z15.h\n"
+                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+                        "fmul z27.h, p7/m, z27.h, z15.h\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "fmla z24.h, z12.h, z2.h[7]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "fmla z25.h, z13.h, z2.h[7]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "fmla z26.h, z14.h, z2.h[7]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
+                        "fmla z27.h, z15.h, z2.h[7]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "fmla z24.h, z8.h, z6.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "fmla z25.h, z9.h, z6.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "fmla z26.h, z10.h, z6.h[0]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "fmla z27.h, z11.h, z6.h[0]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "fmla z24.h, z12.h, z6.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "fmla z25.h, z13.h, z6.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "fmla z26.h, z14.h, z6.h[1]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "fmla z27.h, z15.h, z6.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z24.h, z8.h, z6.h[2]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "fmla z25.h, z9.h, z6.h[2]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "fmla z26.h, z10.h, z6.h[2]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "fmla z27.h, z11.h, z6.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "fmla z24.h, z12.h, z6.h[3]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "fmla z25.h, z13.h, z6.h[3]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "fmla z26.h, z14.h, z6.h[3]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "fmla z27.h, z15.h, z6.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "fmla z24.h, z8.h, z6.h[4]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "fmla z25.h, z9.h, z6.h[4]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "fmla z26.h, z10.h, z6.h[4]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "fmla z27.h, z11.h, z6.h[4]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "fmla z24.h, z12.h, z6.h[5]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "fmla z25.h, z13.h, z6.h[5]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "fmla z26.h, z14.h, z6.h[5]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "fmla z27.h, z15.h, z6.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z24.h, z8.h, z6.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z25.h, z9.h, z6.h[6]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z26.h, z10.h, z6.h[6]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "fmla z27.h, z11.h, z6.h[6]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "fmla z20.h, z12.h, z5.h[7]\n"
+                        "fmla z24.h, z12.h, z6.h[7]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "fmla z21.h, z13.h, z5.h[7]\n"
+                        "fmla z25.h, z13.h, z6.h[7]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "fmla z22.h, z14.h, z5.h[7]\n"
+                        "fmla z26.h, z14.h, z6.h[7]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "fmla z23.h, z15.h, z5.h[7]\n"
+                        "fmla z27.h, z15.h, z6.h[7]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "fmla z24.h, z12.h, z2.h[7]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "fmla z25.h, z13.h, z2.h[7]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "fmla z26.h, z14.h, z2.h[7]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
+                        "fmla z27.h, z15.h, z2.h[7]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "fmla z24.h, z8.h, z6.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "fmla z25.h, z9.h, z6.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "fmla z26.h, z10.h, z6.h[0]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "fmla z27.h, z11.h, z6.h[0]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "fmla z24.h, z12.h, z6.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "fmla z25.h, z13.h, z6.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "fmla z26.h, z14.h, z6.h[1]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "fmla z27.h, z15.h, z6.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z24.h, z8.h, z6.h[2]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "fmla z25.h, z9.h, z6.h[2]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "fmla z26.h, z10.h, z6.h[2]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "fmla z27.h, z11.h, z6.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "fmla z24.h, z12.h, z6.h[3]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "fmla z25.h, z13.h, z6.h[3]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "fmla z26.h, z14.h, z6.h[3]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "fmla z27.h, z15.h, z6.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "fmla z24.h, z8.h, z6.h[4]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "fmla z25.h, z9.h, z6.h[4]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "fmla z26.h, z10.h, z6.h[4]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "fmla z27.h, z11.h, z6.h[4]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "fmla z24.h, z12.h, z6.h[5]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "fmla z25.h, z13.h, z6.h[5]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "fmla z26.h, z14.h, z6.h[5]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "fmla z27.h, z15.h, z6.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z24.h, z8.h, z6.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z25.h, z9.h, z6.h[6]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z26.h, z10.h, z6.h[6]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "fmla z27.h, z11.h, z6.h[6]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "fmla z20.h, z12.h, z5.h[7]\n"
+                        "fmla z24.h, z12.h, z6.h[7]\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "fmla z21.h, z13.h, z5.h[7]\n"
+                        "fmla z25.h, z13.h, z6.h[7]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "fmla z22.h, z14.h, z5.h[7]\n"
+                        "fmla z26.h, z14.h, z6.h[7]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "fmla z23.h, z15.h, z5.h[7]\n"
+                        "fmla z27.h, z15.h, z6.h[7]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "b.eq 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z6.h, p6/z, [a_ptr2]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "fmla z24.h, z12.h, z2.h[7]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "fmla z25.h, z13.h, z2.h[7]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "fmla z26.h, z14.h, z2.h[7]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "fmla z27.h, z15.h, z2.h[7]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "fmla z24.h, z8.h, z6.h[0]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "fmla z25.h, z9.h, z6.h[0]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "fmla z26.h, z10.h, z6.h[0]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "fmla z27.h, z11.h, z6.h[0]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "fmla z24.h, z12.h, z6.h[1]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "fmla z25.h, z13.h, z6.h[1]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "fmla z26.h, z14.h, z6.h[1]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "fmla z27.h, z15.h, z6.h[1]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z24.h, z8.h, z6.h[2]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "fmla z25.h, z9.h, z6.h[2]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "fmla z26.h, z10.h, z6.h[2]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "fmla z27.h, z11.h, z6.h[2]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "fmla z24.h, z12.h, z6.h[3]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "fmla z25.h, z13.h, z6.h[3]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "fmla z26.h, z14.h, z6.h[3]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "fmla z27.h, z15.h, z6.h[3]\n"
+                        "b.eq 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "fmla z24.h, z8.h, z6.h[4]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "fmla z25.h, z9.h, z6.h[4]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "fmla z26.h, z10.h, z6.h[4]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "fmla z27.h, z11.h, z6.h[4]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "fmla z24.h, z12.h, z6.h[5]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "fmla z25.h, z13.h, z6.h[5]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "fmla z26.h, z14.h, z6.h[5]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "fmla z27.h, z15.h, z6.h[5]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z24.h, z8.h, z6.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z25.h, z9.h, z6.h[6]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z26.h, z10.h, z6.h[6]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "fmla z27.h, z11.h, z6.h[6]\n"
+                        "5:\n"
+                        "st1h z16.h, p0, [%[c_ptr0]]\n"
+                        "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1h z20.h, p0, [c_ptr1]\n"
+                        "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
+                        "st1h z24.h, p0, [c_ptr2]\n"
+                        "st1h z25.h, p1, [c_ptr2, #1, MUL VL]\n"
+                        "st1h z26.h, p2, [c_ptr2, #2, MUL VL]\n"
+                        "st1h z27.h, p3, [c_ptr2, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+                    );
+                    break;
+                default:
+                case 4:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "a_ptr3 .req X2\n"
+                        "c_ptr1 .req X3\n"
+                        "c_ptr2 .req X4\n"
+                        "c_ptr3 .req X5\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "add a_ptr3, a_ptr2, %[lda]\n"
+                        "add c_ptr3, c_ptr2, %[ldc]\n"
+                        "whilelt p6.h, %[temp], %[leftovers]\n"
+                        "whilelt p0.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "ptrue p7.h\n"
+                        "whilelt p1.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p2.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p3.h, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.h, #0\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.h, #0\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+                        "mov z18.h, #0\n"
+                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+                        "mov z19.h, #0\n"
+                        "ld1rqh z3.h, p7/z, [a_ptr3]\n"
+                        "mov z20.h, #0\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "mov z21.h, #0\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z22.h, #0\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "mov z23.h, #0\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z24.h, #0\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "mov z25.h, #0\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "mov z26.h, #0\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "mov z27.h, #0\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "mov z28.h, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "mov z29.h, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "mov z30.h, #0\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "mov z31.h, #0\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+                        "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+                        "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "fmul z16.h, p7/m, z16.h, z15.h\n"
+                        "ld1h z20.h, p0/z, [c_ptr1]\n"
+                        "fmul z17.h, p7/m, z17.h, z15.h\n"
+                        "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "fmul z18.h, p7/m, z18.h, z15.h\n"
+                        "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "fmul z19.h, p7/m, z19.h, z15.h\n"
+                        "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "fmul z20.h, p7/m, z20.h, z15.h\n"
+                        "ld1h z24.h, p0/z, [c_ptr2]\n"
+                        "fmul z21.h, p7/m, z21.h, z15.h\n"
+                        "ld1h z25.h, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "fmul z22.h, p7/m, z22.h, z15.h\n"
+                        "ld1h z26.h, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "fmul z23.h, p7/m, z23.h, z15.h\n"
+                        "ld1h z27.h, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "fmul z24.h, p7/m, z24.h, z15.h\n"
+                        "ld1h z28.h, p0/z, [c_ptr3]\n"
+                        "fmul z25.h, p7/m, z25.h, z15.h\n"
+                        "ld1h z29.h, p1/z, [c_ptr3, #1, MUL VL]\n"
+                        "fmul z26.h, p7/m, z26.h, z15.h\n"
+                        "ld1h z30.h, p2/z, [c_ptr3, #2, MUL VL]\n"
+                        "fmul z27.h, p7/m, z27.h, z15.h\n"
+                        "ld1h z31.h, p3/z, [c_ptr3, #3, MUL VL]\n"
+                        "fmul z28.h, p7/m, z28.h, z15.h\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "fmul z29.h, p7/m, z29.h, z15.h\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+                        "fmul z30.h, p7/m, z30.h, z15.h\n"
+                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+                        "fmul z31.h, p7/m, z31.h, z15.h\n"
+                        "ld1rqh z3.h, p7/z, [a_ptr3]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+                        "fmla z28.h, z8.h, z3.h[0]\n"
+                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z7.h, p7/z, [a_ptr3]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla z29.h, z9.h, z3.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "fmla z30.h, z10.h, z3.h[0]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "add a_ptr3, a_ptr3, #0x20\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "fmla z31.h, z11.h, z3.h[0]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "fmla z28.h, z12.h, z3.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "fmla z29.h, z13.h, z3.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "fmla z30.h, z14.h, z3.h[1]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "fmla z31.h, z15.h, z3.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "fmla z28.h, z8.h, z3.h[2]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "fmla z29.h, z9.h, z3.h[2]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "fmla z30.h, z10.h, z3.h[2]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "fmla z31.h, z11.h, z3.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "fmla z28.h, z12.h, z3.h[3]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "fmla z29.h, z13.h, z3.h[3]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "fmla z30.h, z14.h, z3.h[3]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "fmla z31.h, z15.h, z3.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "fmla z28.h, z8.h, z3.h[4]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "fmla z29.h, z9.h, z3.h[4]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "fmla z30.h, z10.h, z3.h[4]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "fmla z31.h, z11.h, z3.h[4]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "fmla z28.h, z12.h, z3.h[5]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "fmla z29.h, z13.h, z3.h[5]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "fmla z30.h, z14.h, z3.h[5]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "fmla z31.h, z15.h, z3.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "fmla z28.h, z8.h, z3.h[6]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "fmla z29.h, z9.h, z3.h[6]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "fmla z30.h, z10.h, z3.h[6]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "fmla z31.h, z11.h, z3.h[6]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "fmla z24.h, z12.h, z2.h[7]\n"
+                        "fmla z28.h, z12.h, z3.h[7]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "fmla z25.h, z13.h, z2.h[7]\n"
+                        "fmla z29.h, z13.h, z3.h[7]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "fmla z26.h, z14.h, z2.h[7]\n"
+                        "fmla z30.h, z14.h, z3.h[7]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
+                        "fmla z27.h, z15.h, z2.h[7]\n"
+                        "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
+                        "fmla z31.h, z15.h, z3.h[7]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "fmla z24.h, z8.h, z6.h[0]\n"
+                        "fmla z28.h, z8.h, z7.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "fmla z25.h, z9.h, z6.h[0]\n"
+                        "fmla z29.h, z9.h, z7.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "fmla z26.h, z10.h, z6.h[0]\n"
+                        "fmla z30.h, z10.h, z7.h[0]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "fmla z27.h, z11.h, z6.h[0]\n"
+                        "fmla z31.h, z11.h, z7.h[0]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "fmla z24.h, z12.h, z6.h[1]\n"
+                        "fmla z28.h, z12.h, z7.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "fmla z25.h, z13.h, z6.h[1]\n"
+                        "fmla z29.h, z13.h, z7.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "fmla z26.h, z14.h, z6.h[1]\n"
+                        "fmla z30.h, z14.h, z7.h[1]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "fmla z27.h, z15.h, z6.h[1]\n"
+                        "fmla z31.h, z15.h, z7.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z24.h, z8.h, z6.h[2]\n"
+                        "fmla z28.h, z8.h, z7.h[2]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "fmla z25.h, z9.h, z6.h[2]\n"
+                        "fmla z29.h, z9.h, z7.h[2]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "fmla z26.h, z10.h, z6.h[2]\n"
+                        "fmla z30.h, z10.h, z7.h[2]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "fmla z27.h, z11.h, z6.h[2]\n"
+                        "fmla z31.h, z11.h, z7.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "fmla z24.h, z12.h, z6.h[3]\n"
+                        "fmla z28.h, z12.h, z7.h[3]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "fmla z25.h, z13.h, z6.h[3]\n"
+                        "fmla z29.h, z13.h, z7.h[3]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "fmla z26.h, z14.h, z6.h[3]\n"
+                        "fmla z30.h, z14.h, z7.h[3]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "fmla z27.h, z15.h, z6.h[3]\n"
+                        "fmla z31.h, z15.h, z7.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "fmla z24.h, z8.h, z6.h[4]\n"
+                        "fmla z28.h, z8.h, z7.h[4]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "fmla z25.h, z9.h, z6.h[4]\n"
+                        "fmla z29.h, z9.h, z7.h[4]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "fmla z26.h, z10.h, z6.h[4]\n"
+                        "fmla z30.h, z10.h, z7.h[4]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "fmla z27.h, z11.h, z6.h[4]\n"
+                        "fmla z31.h, z11.h, z7.h[4]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "fmla z24.h, z12.h, z6.h[5]\n"
+                        "fmla z28.h, z12.h, z7.h[5]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "fmla z25.h, z13.h, z6.h[5]\n"
+                        "fmla z29.h, z13.h, z7.h[5]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "fmla z26.h, z14.h, z6.h[5]\n"
+                        "fmla z30.h, z14.h, z7.h[5]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "fmla z27.h, z15.h, z6.h[5]\n"
+                        "fmla z31.h, z15.h, z7.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z24.h, z8.h, z6.h[6]\n"
+                        "fmla z28.h, z8.h, z7.h[6]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z25.h, z9.h, z6.h[6]\n"
+                        "fmla z29.h, z9.h, z7.h[6]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z26.h, z10.h, z6.h[6]\n"
+                        "fmla z30.h, z10.h, z7.h[6]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "fmla z27.h, z11.h, z6.h[6]\n"
+                        "fmla z31.h, z11.h, z7.h[6]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "fmla z20.h, z12.h, z5.h[7]\n"
+                        "fmla z24.h, z12.h, z6.h[7]\n"
+                        "fmla z28.h, z12.h, z7.h[7]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "fmla z21.h, z13.h, z5.h[7]\n"
+                        "fmla z25.h, z13.h, z6.h[7]\n"
+                        "fmla z29.h, z13.h, z7.h[7]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "fmla z22.h, z14.h, z5.h[7]\n"
+                        "fmla z26.h, z14.h, z6.h[7]\n"
+                        "fmla z30.h, z14.h, z7.h[7]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "fmla z23.h, z15.h, z5.h[7]\n"
+                        "fmla z27.h, z15.h, z6.h[7]\n"
+                        "fmla z31.h, z15.h, z7.h[7]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+                        "fmla z28.h, z8.h, z3.h[0]\n"
+                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z7.h, p7/z, [a_ptr3]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "fmla z29.h, z9.h, z3.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "fmla z30.h, z10.h, z3.h[0]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "fmla z31.h, z11.h, z3.h[0]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "fmla z28.h, z12.h, z3.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "fmla z29.h, z13.h, z3.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "fmla z30.h, z14.h, z3.h[1]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "fmla z31.h, z15.h, z3.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "fmla z28.h, z8.h, z3.h[2]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "fmla z29.h, z9.h, z3.h[2]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "fmla z30.h, z10.h, z3.h[2]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "fmla z31.h, z11.h, z3.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "fmla z28.h, z12.h, z3.h[3]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "fmla z29.h, z13.h, z3.h[3]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "fmla z30.h, z14.h, z3.h[3]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "fmla z31.h, z15.h, z3.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "fmla z28.h, z8.h, z3.h[4]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "fmla z29.h, z9.h, z3.h[4]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "fmla z30.h, z10.h, z3.h[4]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "fmla z31.h, z11.h, z3.h[4]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "fmla z28.h, z12.h, z3.h[5]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "fmla z29.h, z13.h, z3.h[5]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "fmla z30.h, z14.h, z3.h[5]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "fmla z31.h, z15.h, z3.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "fmla z28.h, z8.h, z3.h[6]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "fmla z29.h, z9.h, z3.h[6]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "fmla z30.h, z10.h, z3.h[6]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "fmla z31.h, z11.h, z3.h[6]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "fmla z24.h, z12.h, z2.h[7]\n"
+                        "fmla z28.h, z12.h, z3.h[7]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "fmla z25.h, z13.h, z2.h[7]\n"
+                        "fmla z29.h, z13.h, z3.h[7]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "fmla z26.h, z14.h, z2.h[7]\n"
+                        "fmla z30.h, z14.h, z3.h[7]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
+                        "fmla z27.h, z15.h, z2.h[7]\n"
+                        "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
+                        "fmla z31.h, z15.h, z3.h[7]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "fmla z24.h, z8.h, z6.h[0]\n"
+                        "fmla z28.h, z8.h, z7.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "fmla z25.h, z9.h, z6.h[0]\n"
+                        "fmla z29.h, z9.h, z7.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "fmla z26.h, z10.h, z6.h[0]\n"
+                        "fmla z30.h, z10.h, z7.h[0]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "fmla z27.h, z11.h, z6.h[0]\n"
+                        "fmla z31.h, z11.h, z7.h[0]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "fmla z24.h, z12.h, z6.h[1]\n"
+                        "fmla z28.h, z12.h, z7.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "fmla z25.h, z13.h, z6.h[1]\n"
+                        "fmla z29.h, z13.h, z7.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "fmla z26.h, z14.h, z6.h[1]\n"
+                        "fmla z30.h, z14.h, z7.h[1]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "fmla z27.h, z15.h, z6.h[1]\n"
+                        "fmla z31.h, z15.h, z7.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z24.h, z8.h, z6.h[2]\n"
+                        "fmla z28.h, z8.h, z7.h[2]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "fmla z25.h, z9.h, z6.h[2]\n"
+                        "fmla z29.h, z9.h, z7.h[2]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "fmla z26.h, z10.h, z6.h[2]\n"
+                        "fmla z30.h, z10.h, z7.h[2]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "fmla z27.h, z11.h, z6.h[2]\n"
+                        "fmla z31.h, z11.h, z7.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "fmla z24.h, z12.h, z6.h[3]\n"
+                        "fmla z28.h, z12.h, z7.h[3]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "fmla z25.h, z13.h, z6.h[3]\n"
+                        "fmla z29.h, z13.h, z7.h[3]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "fmla z26.h, z14.h, z6.h[3]\n"
+                        "fmla z30.h, z14.h, z7.h[3]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "fmla z27.h, z15.h, z6.h[3]\n"
+                        "fmla z31.h, z15.h, z7.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "fmla z24.h, z8.h, z6.h[4]\n"
+                        "fmla z28.h, z8.h, z7.h[4]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "fmla z25.h, z9.h, z6.h[4]\n"
+                        "fmla z29.h, z9.h, z7.h[4]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "fmla z26.h, z10.h, z6.h[4]\n"
+                        "fmla z30.h, z10.h, z7.h[4]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "fmla z27.h, z11.h, z6.h[4]\n"
+                        "fmla z31.h, z11.h, z7.h[4]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "fmla z24.h, z12.h, z6.h[5]\n"
+                        "fmla z28.h, z12.h, z7.h[5]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "fmla z25.h, z13.h, z6.h[5]\n"
+                        "fmla z29.h, z13.h, z7.h[5]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "fmla z26.h, z14.h, z6.h[5]\n"
+                        "fmla z30.h, z14.h, z7.h[5]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "fmla z27.h, z15.h, z6.h[5]\n"
+                        "fmla z31.h, z15.h, z7.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z24.h, z8.h, z6.h[6]\n"
+                        "fmla z28.h, z8.h, z7.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z25.h, z9.h, z6.h[6]\n"
+                        "fmla z29.h, z9.h, z7.h[6]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z26.h, z10.h, z6.h[6]\n"
+                        "fmla z30.h, z10.h, z7.h[6]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "fmla z27.h, z11.h, z6.h[6]\n"
+                        "fmla z31.h, z11.h, z7.h[6]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "fmla z20.h, z12.h, z5.h[7]\n"
+                        "fmla z24.h, z12.h, z6.h[7]\n"
+                        "fmla z28.h, z12.h, z7.h[7]\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "fmla z21.h, z13.h, z5.h[7]\n"
+                        "fmla z25.h, z13.h, z6.h[7]\n"
+                        "fmla z29.h, z13.h, z7.h[7]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "fmla z22.h, z14.h, z5.h[7]\n"
+                        "fmla z26.h, z14.h, z6.h[7]\n"
+                        "fmla z30.h, z14.h, z7.h[7]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "fmla z23.h, z15.h, z5.h[7]\n"
+                        "fmla z27.h, z15.h, z6.h[7]\n"
+                        "fmla z31.h, z15.h, z7.h[7]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "fmla z28.h, z8.h, z3.h[0]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "fmla z29.h, z9.h, z3.h[0]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "fmla z30.h, z10.h, z3.h[0]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "fmla z31.h, z11.h, z3.h[0]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "fmla z28.h, z12.h, z3.h[1]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "fmla z29.h, z13.h, z3.h[1]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "fmla z30.h, z14.h, z3.h[1]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "fmla z31.h, z15.h, z3.h[1]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "fmla z28.h, z8.h, z3.h[2]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "fmla z29.h, z9.h, z3.h[2]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "fmla z30.h, z10.h, z3.h[2]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "fmla z31.h, z11.h, z3.h[2]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "fmla z28.h, z12.h, z3.h[3]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "fmla z29.h, z13.h, z3.h[3]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "fmla z30.h, z14.h, z3.h[3]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "fmla z31.h, z15.h, z3.h[3]\n"
+                        "b.eq 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "fmla z28.h, z8.h, z3.h[4]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "fmla z29.h, z9.h, z3.h[4]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "fmla z30.h, z10.h, z3.h[4]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "fmla z31.h, z11.h, z3.h[4]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "fmla z28.h, z12.h, z3.h[5]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "fmla z29.h, z13.h, z3.h[5]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "fmla z30.h, z14.h, z3.h[5]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "fmla z31.h, z15.h, z3.h[5]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "fmla z28.h, z8.h, z3.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "fmla z29.h, z9.h, z3.h[6]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "fmla z30.h, z10.h, z3.h[6]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "fmla z31.h, z11.h, z3.h[6]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+                        "fmla z28.h, z8.h, z3.h[0]\n"
+                        "ld1rqh z6.h, p6/z, [a_ptr2]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z7.h, p6/z, [a_ptr3]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "fmla z29.h, z9.h, z3.h[0]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "fmla z30.h, z10.h, z3.h[0]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "fmla z31.h, z11.h, z3.h[0]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "fmla z28.h, z12.h, z3.h[1]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "fmla z29.h, z13.h, z3.h[1]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "fmla z30.h, z14.h, z3.h[1]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "fmla z31.h, z15.h, z3.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "fmla z28.h, z8.h, z3.h[2]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "fmla z29.h, z9.h, z3.h[2]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "fmla z30.h, z10.h, z3.h[2]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "fmla z31.h, z11.h, z3.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "fmla z28.h, z12.h, z3.h[3]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "fmla z29.h, z13.h, z3.h[3]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "fmla z30.h, z14.h, z3.h[3]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "fmla z31.h, z15.h, z3.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "fmla z28.h, z8.h, z3.h[4]\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "fmla z29.h, z9.h, z3.h[4]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "fmla z30.h, z10.h, z3.h[4]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "fmla z31.h, z11.h, z3.h[4]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "fmla z28.h, z12.h, z3.h[5]\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "fmla z29.h, z13.h, z3.h[5]\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "fmla z30.h, z14.h, z3.h[5]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "fmla z31.h, z15.h, z3.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "fmla z28.h, z8.h, z3.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "fmla z29.h, z9.h, z3.h[6]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "fmla z30.h, z10.h, z3.h[6]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "fmla z31.h, z11.h, z3.h[6]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "fmla z24.h, z12.h, z2.h[7]\n"
+                        "fmla z28.h, z12.h, z3.h[7]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "fmla z25.h, z13.h, z2.h[7]\n"
+                        "fmla z29.h, z13.h, z3.h[7]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "fmla z26.h, z14.h, z2.h[7]\n"
+                        "fmla z30.h, z14.h, z3.h[7]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "fmla z27.h, z15.h, z2.h[7]\n"
+                        "fmla z31.h, z15.h, z3.h[7]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "fmla z24.h, z8.h, z6.h[0]\n"
+                        "fmla z28.h, z8.h, z7.h[0]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "fmla z25.h, z9.h, z6.h[0]\n"
+                        "fmla z29.h, z9.h, z7.h[0]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "fmla z26.h, z10.h, z6.h[0]\n"
+                        "fmla z30.h, z10.h, z7.h[0]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "fmla z27.h, z11.h, z6.h[0]\n"
+                        "fmla z31.h, z11.h, z7.h[0]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "fmla z24.h, z12.h, z6.h[1]\n"
+                        "fmla z28.h, z12.h, z7.h[1]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "fmla z25.h, z13.h, z6.h[1]\n"
+                        "fmla z29.h, z13.h, z7.h[1]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "fmla z26.h, z14.h, z6.h[1]\n"
+                        "fmla z30.h, z14.h, z7.h[1]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "fmla z27.h, z15.h, z6.h[1]\n"
+                        "fmla z31.h, z15.h, z7.h[1]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z24.h, z8.h, z6.h[2]\n"
+                        "fmla z28.h, z8.h, z7.h[2]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "fmla z25.h, z9.h, z6.h[2]\n"
+                        "fmla z29.h, z9.h, z7.h[2]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "fmla z26.h, z10.h, z6.h[2]\n"
+                        "fmla z30.h, z10.h, z7.h[2]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "fmla z27.h, z11.h, z6.h[2]\n"
+                        "fmla z31.h, z11.h, z7.h[2]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "fmla z24.h, z12.h, z6.h[3]\n"
+                        "fmla z28.h, z12.h, z7.h[3]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "fmla z25.h, z13.h, z6.h[3]\n"
+                        "fmla z29.h, z13.h, z7.h[3]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "fmla z26.h, z14.h, z6.h[3]\n"
+                        "fmla z30.h, z14.h, z7.h[3]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "fmla z27.h, z15.h, z6.h[3]\n"
+                        "fmla z31.h, z15.h, z7.h[3]\n"
+                        "b.eq 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "fmla z24.h, z8.h, z6.h[4]\n"
+                        "fmla z28.h, z8.h, z7.h[4]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "fmla z25.h, z9.h, z6.h[4]\n"
+                        "fmla z29.h, z9.h, z7.h[4]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "fmla z26.h, z10.h, z6.h[4]\n"
+                        "fmla z30.h, z10.h, z7.h[4]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "fmla z27.h, z11.h, z6.h[4]\n"
+                        "fmla z31.h, z11.h, z7.h[4]\n"
+                        "b.eq 5f\n"
+                        "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "fmla z24.h, z12.h, z6.h[5]\n"
+                        "fmla z28.h, z12.h, z7.h[5]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "fmla z25.h, z13.h, z6.h[5]\n"
+                        "fmla z29.h, z13.h, z7.h[5]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "fmla z26.h, z14.h, z6.h[5]\n"
+                        "fmla z30.h, z14.h, z7.h[5]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "fmla z27.h, z15.h, z6.h[5]\n"
+                        "fmla z31.h, z15.h, z7.h[5]\n"
+                        "b.eq 5f\n"
+                        "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z24.h, z8.h, z6.h[6]\n"
+                        "fmla z28.h, z8.h, z7.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z25.h, z9.h, z6.h[6]\n"
+                        "fmla z29.h, z9.h, z7.h[6]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z26.h, z10.h, z6.h[6]\n"
+                        "fmla z30.h, z10.h, z7.h[6]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "fmla z27.h, z11.h, z6.h[6]\n"
+                        "fmla z31.h, z11.h, z7.h[6]\n"
+                        "5:\n"
+                        "st1h z16.h, p0, [%[c_ptr0]]\n"
+                        "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1h z20.h, p0, [c_ptr1]\n"
+                        "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
+                        "st1h z24.h, p0, [c_ptr2]\n"
+                        "st1h z25.h, p1, [c_ptr2, #1, MUL VL]\n"
+                        "st1h z26.h, p2, [c_ptr2, #2, MUL VL]\n"
+                        "st1h z27.h, p3, [c_ptr2, #3, MUL VL]\n"
+                        "st1h z28.h, p0, [c_ptr3]\n"
+                        "st1h z29.h, p1, [c_ptr3, #1, MUL VL]\n"
+                        "st1h z30.h, p2, [c_ptr3, #2, MUL VL]\n"
+                        "st1h z31.h, p3, [c_ptr3, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq a_ptr3\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        ".unreq c_ptr3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+                    );
+                    break;
+            }
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp
new file mode 100644
index 0000000..ffd7918
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+#include <cstdint>
+#include "../std_transforms_sve.hpp"
+
+namespace arm_gemm
+{
+
+// Actual kernel implementation (defined in sve_hybrid_s8s32_dot_4VLx4/generic.cpp); arguments in order: A, lda, B, C, ldc, beta, M, N, K.
+void sve_hybrid_s8s32_dot_4VLx4(const int8_t *, int, const int8_t *, int32_t *, int, int32_t, int, int, int);
+
+// Descriptor for the sve_hybrid_s8s32_dot_4VLx4 kernel (int8_t operands, int32_t results):
+// bundles the element types, the blocking parameters and the kernel entry point in one type.
+// NOTE(review): presumably used as the strategy template argument of GemmHybrid<> - confirm.
+class hybrid_s8s32_dot_4VLx4
+{
+public:
+    using operand_type = int8_t;
+    using result_type  = int32_t;
+
+    // Function-pointer type matching the sve_hybrid_s8s32_dot_4VLx4() prototype.
+    using kern_type = void (*)(const int8_t *, int, const int8_t *, int32_t *, int, int32_t, int, int, int);
+
+    /* Kernel blocking parameters */
+    // Output block height.
+    static unsigned int out_height() { return 4; }
+
+    // Output block width: four times get_vector_length<int32_t>() (the "4VL" in the name).
+    static unsigned int out_width() { return get_vector_length<int32_t>() * 4; }
+
+    // Unroll factor along K.
+    static unsigned int k_unroll() { return 4; }
+
+    // NOTE(review): the 4, 4, 4 arguments appear to mirror the blocking values above - confirm.
+    StdTransformsSVE<operand_type, result_type, 4, 4, 4> transforms = {};
+
+    // Default to the generic kernel
+    kern_type kernel = sve_hybrid_s8s32_dot_4VLx4;
+
+    // The CPUInfo pointer is accepted but never read: the body is empty and the generic kernel
+    // is always kept. The parameter is unnamed to avoid an unused-parameter warning.
+    hybrid_s8s32_dot_4VLx4(const CPUInfo *)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp
new file mode 100644
index 0000000..673f186
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp
@@ -0,0 +1,2150 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_hybrid_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int32_t beta, int M, int N, int K) {
+    const long beta0 = (beta == 0);
+    const int K_stride = ((K + 3) / 4) * 4;
+    const long loops_count = ((K + 16) / 32) - 1;
+    K -= loops_count * 32;
+    const long regs_count = (K / 16) - 1;
+    K -= (regs_count + 1) * 16;
+    const long leftovers = K;
+    const long blocks_count = (K + 3) / 4;
+
+    for (int y=0; y<M; y+=4) {
+        const int8_t * const a_ptr0_base = A + (y * lda);
+        const unsigned long ldab = lda * sizeof(int8_t);
+
+        int32_t *c_ptr0 = C + (y * ldc);
+        const unsigned long ldcb = ldc * sizeof(int32_t);
+
+        for (int x0=0; x0<N; x0+=(4 * get_vector_length<int32_t>())) {
+            const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<int32_t>()));
+            const int32_t *betaptr = &beta;
+            long loops = loops_count;
+            long regs = regs_count;
+            long temp = 0;
+            long blocks = blocks_count;
+            const int8_t *a_ptr0 = a_ptr0_base;
+            const int8_t *b_ptr0 = B + (K_stride * x0);
+
+            switch(M-y) {
+                case 1:
+                    __asm __volatile (
+                        "whilelt p6.b, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.b\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "mov z18.s, #0\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z19.s, #0\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "mul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "mul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mul z19.s, p7/m, z19.s, z15.s\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "5:\n"
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                    );
+                    break;
+                case 2:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "c_ptr1 .req X1\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "whilelt p6.b, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.b\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mov z18.s, #0\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "mov z19.s, #0\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z20.s, #0\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "mov z21.s, #0\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z22.s, #0\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "mov z23.s, #0\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "mul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
+                        "mul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "mul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "mul z19.s, p7/m, z19.s, z15.s\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "mul z20.s, p7/m, z20.s, z15.s\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mul z21.s, p7/m, z21.s, z15.s\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mul z22.s, p7/m, z22.s, z15.s\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "mul z23.s, p7/m, z23.s, z15.s\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z21.s, z9.b, z5.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z21.s, z13.b, z5.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z21.s, z9.b, z5.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z20.s, z12.b, z5.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z21.s, z13.b, z5.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z22.s, z14.b, z5.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "sdot z23.s, z15.b, z5.b[3]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z21.s, z9.b, z5.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z21.s, z13.b, z5.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z21.s, z9.b, z5.b[2]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z20.s, z12.b, z5.b[3]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z21.s, z13.b, z5.b[3]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z22.s, z14.b, z5.b[3]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "sdot z23.s, z15.b, z5.b[3]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z21.s, z9.b, z5.b[0]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z21.s, z13.b, z5.b[1]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z21.s, z9.b, z5.b[2]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z20.s, z12.b, z5.b[3]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z21.s, z13.b, z5.b[3]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z22.s, z14.b, z5.b[3]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "sdot z23.s, z15.b, z5.b[3]\n"
+                        "5:\n"
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1w z20.s, p0, [c_ptr1]\n"
+                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq c_ptr1\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+                    );
+                    break;
+                case 3:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "c_ptr1 .req X2\n"
+                        "c_ptr2 .req X3\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "whilelt p6.b, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.b\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mov z18.s, #0\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "mov z19.s, #0\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "mov z20.s, #0\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z21.s, #0\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "mov z22.s, #0\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z23.s, #0\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "mov z24.s, #0\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "mov z25.s, #0\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "mov z26.s, #0\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "mov z27.s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "mul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
+                        "mul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "mul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "mul z19.s, p7/m, z19.s, z15.s\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "mul z20.s, p7/m, z20.s, z15.s\n"
+                        "ld1w z24.s, p0/z, [c_ptr2]\n"
+                        "mul z21.s, p7/m, z21.s, z15.s\n"
+                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "mul z22.s, p7/m, z22.s, z15.s\n"
+                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "mul z23.s, p7/m, z23.s, z15.s\n"
+                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "mul z24.s, p7/m, z24.s, z15.s\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mul z25.s, p7/m, z25.s, z15.s\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mul z26.s, p7/m, z26.s, z15.s\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "mul z27.s, p7/m, z27.s, z15.s\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "sdot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "sdot z26.s, z10.b, z2.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "sdot z27.s, z11.b, z2.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z24.s, z12.b, z2.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "sdot z25.s, z13.b, z2.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z26.s, z14.b, z2.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "sdot z27.s, z15.b, z2.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z24.s, z8.b, z2.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z25.s, z9.b, z2.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z26.s, z10.b, z2.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "sdot z27.s, z11.b, z2.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z24.s, z12.b, z2.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "sdot z25.s, z13.b, z2.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z26.s, z14.b, z2.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+                        "sdot z27.s, z15.b, z2.b[3]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "sdot z24.s, z8.b, z6.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z21.s, z9.b, z5.b[0]\n"
+                        "sdot z25.s, z9.b, z6.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "sdot z26.s, z10.b, z6.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "sdot z27.s, z11.b, z6.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "sdot z24.s, z12.b, z6.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z21.s, z13.b, z5.b[1]\n"
+                        "sdot z25.s, z13.b, z6.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "sdot z26.s, z14.b, z6.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "sdot z27.s, z15.b, z6.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z24.s, z8.b, z6.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "sdot z21.s, z9.b, z5.b[2]\n"
+                        "sdot z25.s, z9.b, z6.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "sdot z26.s, z10.b, z6.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "sdot z27.s, z11.b, z6.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z20.s, z12.b, z5.b[3]\n"
+                        "sdot z24.s, z12.b, z6.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z21.s, z13.b, z5.b[3]\n"
+                        "sdot z25.s, z13.b, z6.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z22.s, z14.b, z5.b[3]\n"
+                        "sdot z26.s, z14.b, z6.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "sdot z23.s, z15.b, z5.b[3]\n"
+                        "sdot z27.s, z15.b, z6.b[3]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "sdot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "sdot z26.s, z10.b, z2.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "sdot z27.s, z11.b, z2.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z24.s, z12.b, z2.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "sdot z25.s, z13.b, z2.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z26.s, z14.b, z2.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "sdot z27.s, z15.b, z2.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z24.s, z8.b, z2.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z25.s, z9.b, z2.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z26.s, z10.b, z2.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "sdot z27.s, z11.b, z2.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z24.s, z12.b, z2.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "sdot z25.s, z13.b, z2.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z26.s, z14.b, z2.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+                        "sdot z27.s, z15.b, z2.b[3]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "sdot z24.s, z8.b, z6.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z21.s, z9.b, z5.b[0]\n"
+                        "sdot z25.s, z9.b, z6.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "sdot z26.s, z10.b, z6.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "sdot z27.s, z11.b, z6.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "sdot z24.s, z12.b, z6.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z21.s, z13.b, z5.b[1]\n"
+                        "sdot z25.s, z13.b, z6.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "sdot z26.s, z14.b, z6.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "sdot z27.s, z15.b, z6.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z24.s, z8.b, z6.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z21.s, z9.b, z5.b[2]\n"
+                        "sdot z25.s, z9.b, z6.b[2]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "sdot z26.s, z10.b, z6.b[2]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "sdot z27.s, z11.b, z6.b[2]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z20.s, z12.b, z5.b[3]\n"
+                        "sdot z24.s, z12.b, z6.b[3]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z21.s, z13.b, z5.b[3]\n"
+                        "sdot z25.s, z13.b, z6.b[3]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z22.s, z14.b, z5.b[3]\n"
+                        "sdot z26.s, z14.b, z6.b[3]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "sdot z23.s, z15.b, z5.b[3]\n"
+                        "sdot z27.s, z15.b, z6.b[3]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "sdot z24.s, z8.b, z2.b[0]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "sdot z26.s, z10.b, z2.b[0]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "sdot z27.s, z11.b, z2.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z24.s, z12.b, z2.b[1]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "sdot z25.s, z13.b, z2.b[1]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z26.s, z14.b, z2.b[1]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "sdot z27.s, z15.b, z2.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z24.s, z8.b, z2.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z25.s, z9.b, z2.b[2]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z26.s, z10.b, z2.b[2]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "sdot z27.s, z11.b, z2.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z24.s, z12.b, z2.b[3]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "sdot z25.s, z13.b, z2.b[3]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z26.s, z14.b, z2.b[3]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "sdot z27.s, z15.b, z2.b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+                        "sdot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "sdot z26.s, z10.b, z2.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "sdot z27.s, z11.b, z2.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z24.s, z12.b, z2.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "sdot z25.s, z13.b, z2.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z26.s, z14.b, z2.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "sdot z27.s, z15.b, z2.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z24.s, z8.b, z2.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z25.s, z9.b, z2.b[2]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z26.s, z10.b, z2.b[2]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "sdot z27.s, z11.b, z2.b[2]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z24.s, z12.b, z2.b[3]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "sdot z25.s, z13.b, z2.b[3]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z26.s, z14.b, z2.b[3]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "sdot z27.s, z15.b, z2.b[3]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "sdot z24.s, z8.b, z6.b[0]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z21.s, z9.b, z5.b[0]\n"
+                        "sdot z25.s, z9.b, z6.b[0]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "sdot z26.s, z10.b, z6.b[0]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "sdot z27.s, z11.b, z6.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "sdot z24.s, z12.b, z6.b[1]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z21.s, z13.b, z5.b[1]\n"
+                        "sdot z25.s, z13.b, z6.b[1]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "sdot z26.s, z14.b, z6.b[1]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "sdot z27.s, z15.b, z6.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z24.s, z8.b, z6.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z21.s, z9.b, z5.b[2]\n"
+                        "sdot z25.s, z9.b, z6.b[2]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "sdot z26.s, z10.b, z6.b[2]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "sdot z27.s, z11.b, z6.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z20.s, z12.b, z5.b[3]\n"
+                        "sdot z24.s, z12.b, z6.b[3]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z21.s, z13.b, z5.b[3]\n"
+                        "sdot z25.s, z13.b, z6.b[3]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z22.s, z14.b, z5.b[3]\n"
+                        "sdot z26.s, z14.b, z6.b[3]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "sdot z23.s, z15.b, z5.b[3]\n"
+                        "sdot z27.s, z15.b, z6.b[3]\n"
+                        "5:\n"
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1w z20.s, p0, [c_ptr1]\n"
+                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+                        "st1w z24.s, p0, [c_ptr2]\n"
+                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+                    );
+                    break;
+                default:
+                case 4:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "a_ptr3 .req X2\n"
+                        "c_ptr1 .req X3\n"
+                        "c_ptr2 .req X4\n"
+                        "c_ptr3 .req X5\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "add a_ptr3, a_ptr2, %[lda]\n"
+                        "add c_ptr3, c_ptr2, %[ldc]\n"
+                        "whilelt p6.b, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.b\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mov z18.s, #0\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "mov z19.s, #0\n"
+                        "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                        "mov z20.s, #0\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "mov z21.s, #0\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z22.s, #0\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "mov z23.s, #0\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z24.s, #0\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "mov z25.s, #0\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "mov z26.s, #0\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "mov z27.s, #0\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "mov z28.s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "mov z29.s, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "mov z30.s, #0\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "mov z31.s, #0\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "mul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
+                        "mul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "mul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "mul z19.s, p7/m, z19.s, z15.s\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "mul z20.s, p7/m, z20.s, z15.s\n"
+                        "ld1w z24.s, p0/z, [c_ptr2]\n"
+                        "mul z21.s, p7/m, z21.s, z15.s\n"
+                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "mul z22.s, p7/m, z22.s, z15.s\n"
+                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "mul z23.s, p7/m, z23.s, z15.s\n"
+                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "mul z24.s, p7/m, z24.s, z15.s\n"
+                        "ld1w z28.s, p0/z, [c_ptr3]\n"
+                        "mul z25.s, p7/m, z25.s, z15.s\n"
+                        "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+                        "mul z26.s, p7/m, z26.s, z15.s\n"
+                        "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+                        "mul z27.s, p7/m, z27.s, z15.s\n"
+                        "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+                        "mul z28.s, p7/m, z28.s, z15.s\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mul z29.s, p7/m, z29.s, z15.s\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mul z30.s, p7/m, z30.s, z15.s\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "mul z31.s, p7/m, z31.s, z15.s\n"
+                        "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "sdot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "sdot z28.s, z8.b, z3.b[0]\n"
+                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "sdot z29.s, z9.b, z3.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "sdot z26.s, z10.b, z2.b[0]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "sdot z30.s, z10.b, z3.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "add a_ptr3, a_ptr3, #0x20\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "sdot z27.s, z11.b, z2.b[0]\n"
+                        "sdot z31.s, z11.b, z3.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z24.s, z12.b, z2.b[1]\n"
+                        "sdot z28.s, z12.b, z3.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "sdot z25.s, z13.b, z2.b[1]\n"
+                        "sdot z29.s, z13.b, z3.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z26.s, z14.b, z2.b[1]\n"
+                        "sdot z30.s, z14.b, z3.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "sdot z27.s, z15.b, z2.b[1]\n"
+                        "sdot z31.s, z15.b, z3.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z24.s, z8.b, z2.b[2]\n"
+                        "sdot z28.s, z8.b, z3.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z25.s, z9.b, z2.b[2]\n"
+                        "sdot z29.s, z9.b, z3.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z26.s, z10.b, z2.b[2]\n"
+                        "sdot z30.s, z10.b, z3.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "sdot z27.s, z11.b, z2.b[2]\n"
+                        "sdot z31.s, z11.b, z3.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z24.s, z12.b, z2.b[3]\n"
+                        "sdot z28.s, z12.b, z3.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "sdot z25.s, z13.b, z2.b[3]\n"
+                        "sdot z29.s, z13.b, z3.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z26.s, z14.b, z2.b[3]\n"
+                        "sdot z30.s, z14.b, z3.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+                        "sdot z27.s, z15.b, z2.b[3]\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+                        "sdot z31.s, z15.b, z3.b[3]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "sdot z24.s, z8.b, z6.b[0]\n"
+                        "sdot z28.s, z8.b, z7.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z21.s, z9.b, z5.b[0]\n"
+                        "sdot z25.s, z9.b, z6.b[0]\n"
+                        "sdot z29.s, z9.b, z7.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "sdot z26.s, z10.b, z6.b[0]\n"
+                        "sdot z30.s, z10.b, z7.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "sdot z27.s, z11.b, z6.b[0]\n"
+                        "sdot z31.s, z11.b, z7.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "sdot z24.s, z12.b, z6.b[1]\n"
+                        "sdot z28.s, z12.b, z7.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z21.s, z13.b, z5.b[1]\n"
+                        "sdot z25.s, z13.b, z6.b[1]\n"
+                        "sdot z29.s, z13.b, z7.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "sdot z26.s, z14.b, z6.b[1]\n"
+                        "sdot z30.s, z14.b, z7.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "sdot z27.s, z15.b, z6.b[1]\n"
+                        "sdot z31.s, z15.b, z7.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z24.s, z8.b, z6.b[2]\n"
+                        "sdot z28.s, z8.b, z7.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z21.s, z9.b, z5.b[2]\n"
+                        "sdot z25.s, z9.b, z6.b[2]\n"
+                        "sdot z29.s, z9.b, z7.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "sdot z26.s, z10.b, z6.b[2]\n"
+                        "sdot z30.s, z10.b, z7.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "sdot z27.s, z11.b, z6.b[2]\n"
+                        "sdot z31.s, z11.b, z7.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z20.s, z12.b, z5.b[3]\n"
+                        "sdot z24.s, z12.b, z6.b[3]\n"
+                        "sdot z28.s, z12.b, z7.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z21.s, z13.b, z5.b[3]\n"
+                        "sdot z25.s, z13.b, z6.b[3]\n"
+                        "sdot z29.s, z13.b, z7.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z22.s, z14.b, z5.b[3]\n"
+                        "sdot z26.s, z14.b, z6.b[3]\n"
+                        "sdot z30.s, z14.b, z7.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "sdot z23.s, z15.b, z5.b[3]\n"
+                        "sdot z27.s, z15.b, z6.b[3]\n"
+                        "sdot z31.s, z15.b, z7.b[3]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "sdot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "sdot z28.s, z8.b, z3.b[0]\n"
+                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "sdot z29.s, z9.b, z3.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "sdot z26.s, z10.b, z2.b[0]\n"
+                        "sdot z30.s, z10.b, z3.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "sdot z27.s, z11.b, z2.b[0]\n"
+                        "sdot z31.s, z11.b, z3.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z24.s, z12.b, z2.b[1]\n"
+                        "sdot z28.s, z12.b, z3.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "sdot z25.s, z13.b, z2.b[1]\n"
+                        "sdot z29.s, z13.b, z3.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z26.s, z14.b, z2.b[1]\n"
+                        "sdot z30.s, z14.b, z3.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "sdot z27.s, z15.b, z2.b[1]\n"
+                        "sdot z31.s, z15.b, z3.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z24.s, z8.b, z2.b[2]\n"
+                        "sdot z28.s, z8.b, z3.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z25.s, z9.b, z2.b[2]\n"
+                        "sdot z29.s, z9.b, z3.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z26.s, z10.b, z2.b[2]\n"
+                        "sdot z30.s, z10.b, z3.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "sdot z27.s, z11.b, z2.b[2]\n"
+                        "sdot z31.s, z11.b, z3.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z24.s, z12.b, z2.b[3]\n"
+                        "sdot z28.s, z12.b, z3.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "sdot z25.s, z13.b, z2.b[3]\n"
+                        "sdot z29.s, z13.b, z3.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z26.s, z14.b, z2.b[3]\n"
+                        "sdot z30.s, z14.b, z3.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+                        "sdot z27.s, z15.b, z2.b[3]\n"
+                        "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+                        "sdot z31.s, z15.b, z3.b[3]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "sdot z24.s, z8.b, z6.b[0]\n"
+                        "sdot z28.s, z8.b, z7.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z21.s, z9.b, z5.b[0]\n"
+                        "sdot z25.s, z9.b, z6.b[0]\n"
+                        "sdot z29.s, z9.b, z7.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "sdot z26.s, z10.b, z6.b[0]\n"
+                        "sdot z30.s, z10.b, z7.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "sdot z27.s, z11.b, z6.b[0]\n"
+                        "sdot z31.s, z11.b, z7.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "sdot z24.s, z12.b, z6.b[1]\n"
+                        "sdot z28.s, z12.b, z7.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z21.s, z13.b, z5.b[1]\n"
+                        "sdot z25.s, z13.b, z6.b[1]\n"
+                        "sdot z29.s, z13.b, z7.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "sdot z26.s, z14.b, z6.b[1]\n"
+                        "sdot z30.s, z14.b, z7.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "sdot z27.s, z15.b, z6.b[1]\n"
+                        "sdot z31.s, z15.b, z7.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z24.s, z8.b, z6.b[2]\n"
+                        "sdot z28.s, z8.b, z7.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z21.s, z9.b, z5.b[2]\n"
+                        "sdot z25.s, z9.b, z6.b[2]\n"
+                        "sdot z29.s, z9.b, z7.b[2]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "sdot z26.s, z10.b, z6.b[2]\n"
+                        "sdot z30.s, z10.b, z7.b[2]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "sdot z27.s, z11.b, z6.b[2]\n"
+                        "sdot z31.s, z11.b, z7.b[2]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z20.s, z12.b, z5.b[3]\n"
+                        "sdot z24.s, z12.b, z6.b[3]\n"
+                        "sdot z28.s, z12.b, z7.b[3]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z21.s, z13.b, z5.b[3]\n"
+                        "sdot z25.s, z13.b, z6.b[3]\n"
+                        "sdot z29.s, z13.b, z7.b[3]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z22.s, z14.b, z5.b[3]\n"
+                        "sdot z26.s, z14.b, z6.b[3]\n"
+                        "sdot z30.s, z14.b, z7.b[3]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "sdot z23.s, z15.b, z5.b[3]\n"
+                        "sdot z27.s, z15.b, z6.b[3]\n"
+                        "sdot z31.s, z15.b, z7.b[3]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "sdot z24.s, z8.b, z2.b[0]\n"
+                        "sdot z28.s, z8.b, z3.b[0]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "sdot z29.s, z9.b, z3.b[0]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "sdot z26.s, z10.b, z2.b[0]\n"
+                        "sdot z30.s, z10.b, z3.b[0]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "sdot z27.s, z11.b, z2.b[0]\n"
+                        "sdot z31.s, z11.b, z3.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z24.s, z12.b, z2.b[1]\n"
+                        "sdot z28.s, z12.b, z3.b[1]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "sdot z25.s, z13.b, z2.b[1]\n"
+                        "sdot z29.s, z13.b, z3.b[1]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z26.s, z14.b, z2.b[1]\n"
+                        "sdot z30.s, z14.b, z3.b[1]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "sdot z27.s, z15.b, z2.b[1]\n"
+                        "sdot z31.s, z15.b, z3.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z24.s, z8.b, z2.b[2]\n"
+                        "sdot z28.s, z8.b, z3.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z25.s, z9.b, z2.b[2]\n"
+                        "sdot z29.s, z9.b, z3.b[2]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z26.s, z10.b, z2.b[2]\n"
+                        "sdot z30.s, z10.b, z3.b[2]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "sdot z27.s, z11.b, z2.b[2]\n"
+                        "sdot z31.s, z11.b, z3.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z24.s, z12.b, z2.b[3]\n"
+                        "sdot z28.s, z12.b, z3.b[3]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "sdot z25.s, z13.b, z2.b[3]\n"
+                        "sdot z29.s, z13.b, z3.b[3]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z26.s, z14.b, z2.b[3]\n"
+                        "sdot z30.s, z14.b, z3.b[3]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "sdot z27.s, z15.b, z2.b[3]\n"
+                        "sdot z31.s, z15.b, z3.b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+                        "sdot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+                        "sdot z28.s, z8.b, z3.b[0]\n"
+                        "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z7.b, p6/z, [a_ptr3]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "sdot z29.s, z9.b, z3.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "sdot z26.s, z10.b, z2.b[0]\n"
+                        "sdot z30.s, z10.b, z3.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "sdot z23.s, z11.b, z1.b[0]\n"
+                        "sdot z27.s, z11.b, z2.b[0]\n"
+                        "sdot z31.s, z11.b, z3.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z24.s, z12.b, z2.b[1]\n"
+                        "sdot z28.s, z12.b, z3.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "sdot z17.s, z13.b, z0.b[1]\n"
+                        "sdot z21.s, z13.b, z1.b[1]\n"
+                        "sdot z25.s, z13.b, z2.b[1]\n"
+                        "sdot z29.s, z13.b, z3.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z26.s, z14.b, z2.b[1]\n"
+                        "sdot z30.s, z14.b, z3.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "sdot z27.s, z15.b, z2.b[1]\n"
+                        "sdot z31.s, z15.b, z3.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z24.s, z8.b, z2.b[2]\n"
+                        "sdot z28.s, z8.b, z3.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z21.s, z9.b, z1.b[2]\n"
+                        "sdot z25.s, z9.b, z2.b[2]\n"
+                        "sdot z29.s, z9.b, z3.b[2]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z26.s, z10.b, z2.b[2]\n"
+                        "sdot z30.s, z10.b, z3.b[2]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "sdot z27.s, z11.b, z2.b[2]\n"
+                        "sdot z31.s, z11.b, z3.b[2]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z24.s, z12.b, z2.b[3]\n"
+                        "sdot z28.s, z12.b, z3.b[3]\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z21.s, z13.b, z1.b[3]\n"
+                        "sdot z25.s, z13.b, z2.b[3]\n"
+                        "sdot z29.s, z13.b, z3.b[3]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z26.s, z14.b, z2.b[3]\n"
+                        "sdot z30.s, z14.b, z3.b[3]\n"
+                        "sdot z19.s, z15.b, z0.b[3]\n"
+                        "sdot z23.s, z15.b, z1.b[3]\n"
+                        "sdot z27.s, z15.b, z2.b[3]\n"
+                        "sdot z31.s, z15.b, z3.b[3]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "sdot z24.s, z8.b, z6.b[0]\n"
+                        "sdot z28.s, z8.b, z7.b[0]\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
+                        "sdot z21.s, z9.b, z5.b[0]\n"
+                        "sdot z25.s, z9.b, z6.b[0]\n"
+                        "sdot z29.s, z9.b, z7.b[0]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "sdot z26.s, z10.b, z6.b[0]\n"
+                        "sdot z30.s, z10.b, z7.b[0]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "sdot z27.s, z11.b, z6.b[0]\n"
+                        "sdot z31.s, z11.b, z7.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "sdot z24.s, z12.b, z6.b[1]\n"
+                        "sdot z28.s, z12.b, z7.b[1]\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z21.s, z13.b, z5.b[1]\n"
+                        "sdot z25.s, z13.b, z6.b[1]\n"
+                        "sdot z29.s, z13.b, z7.b[1]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "sdot z26.s, z14.b, z6.b[1]\n"
+                        "sdot z30.s, z14.b, z7.b[1]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "sdot z27.s, z15.b, z6.b[1]\n"
+                        "sdot z31.s, z15.b, z7.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z24.s, z8.b, z6.b[2]\n"
+                        "sdot z28.s, z8.b, z7.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z21.s, z9.b, z5.b[2]\n"
+                        "sdot z25.s, z9.b, z6.b[2]\n"
+                        "sdot z29.s, z9.b, z7.b[2]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "sdot z26.s, z10.b, z6.b[2]\n"
+                        "sdot z30.s, z10.b, z7.b[2]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "sdot z27.s, z11.b, z6.b[2]\n"
+                        "sdot z31.s, z11.b, z7.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "sdot z20.s, z12.b, z5.b[3]\n"
+                        "sdot z24.s, z12.b, z6.b[3]\n"
+                        "sdot z28.s, z12.b, z7.b[3]\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z21.s, z13.b, z5.b[3]\n"
+                        "sdot z25.s, z13.b, z6.b[3]\n"
+                        "sdot z29.s, z13.b, z7.b[3]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "sdot z22.s, z14.b, z5.b[3]\n"
+                        "sdot z26.s, z14.b, z6.b[3]\n"
+                        "sdot z30.s, z14.b, z7.b[3]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "sdot z23.s, z15.b, z5.b[3]\n"
+                        "sdot z27.s, z15.b, z6.b[3]\n"
+                        "sdot z31.s, z15.b, z7.b[3]\n"
+                        "5:\n"
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1w z20.s, p0, [c_ptr1]\n"
+                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+                        "st1w z24.s, p0, [c_ptr2]\n"
+                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+                        "st1w z28.s, p0, [c_ptr3]\n"
+                        "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
+                        "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
+                        "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq a_ptr3\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        ".unreq c_ptr3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+                    );
+                    break;
+            }
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp
new file mode 100644
index 0000000..2701a9e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+#include <cstdint>
+#include "../std_transforms_sve.hpp"
+
+namespace arm_gemm
+{
+
+// Kernel entry point, defined in sve_hybrid_u8u32_dot_4VLx4/generic.cpp; arguments are (A, lda, B, C, ldc, beta, M, N, K).
+void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *, int, const uint8_t *, uint32_t *, int, uint32_t, int, int, int);
+
+// Descriptor for the SVE hybrid u8 -> u32 dot kernel: operand/result types, blocking, transforms and entry point.
+class hybrid_u8u32_dot_4VLx4
+{
+public:
+    typedef uint8_t operand_type; // element type of the A and B inputs passed to the kernel
+    typedef uint32_t result_type; // element type of the C output written by the kernel
+    // Pointer type for the kernel entry point; same argument list as sve_hybrid_u8u32_dot_4VLx4().
+    typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, uint32_t *, int, uint32_t, int, int, int);
+
+    /* Kernel blocking parameters */
+    static unsigned int out_height()
+    {
+        return 4; // the generic kernel walks the output rows in steps of 4 (y += 4)
+    }
+    // Four times get_vector_length<uint32_t>(); presumably the uint32_t lane count of one SVE vector ("4VL") - TODO confirm.
+    static unsigned int out_width()
+    {
+        return get_vector_length<uint32_t>() * 4;
+    }
+    // The generic kernel rounds its K stride up to a multiple of 4 (K_stride); this presumably advertises that granularity.
+    static unsigned int k_unroll()
+    {
+        return 4;
+    }
+    // NOTE(review): the literal 4, 4, 4 presumably mirror the blocking values above - confirm against std_transforms_sve.hpp.
+    StdTransformsSVE<operand_type, result_type, 4, 4, 4> transforms = {};
+
+    // Default to the generic kernel
+    kern_type kernel=sve_hybrid_u8u32_dot_4VLx4;
+    // The CPU description is never consulted (kernel keeps its default), so the parameter is unnamed to avoid -Wunused-parameter.
+    hybrid_u8u32_dot_4VLx4(const CPUInfo *)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp
new file mode 100644
index 0000000..d34d0e5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp
@@ -0,0 +1,2150 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, uint32_t beta, int M, int N, int K) {
+    const long beta0 = (beta == 0u);
+    const int K_stride = ((K + 3) / 4) * 4;
+    const long loops_count = ((K + 16) / 32) - 1;
+    K -= loops_count * 32;
+    const long regs_count = (K / 16) - 1;
+    K -= (regs_count + 1) * 16;
+    const long leftovers = K;
+    const long blocks_count = (K + 3) / 4;
+
+    for (int y=0; y<M; y+=4) {
+        const uint8_t * const a_ptr0_base = A + (y * lda);
+        const unsigned long ldab = lda * sizeof(uint8_t);
+
+        uint32_t *c_ptr0 = C + (y * ldc);
+        const unsigned long ldcb = ldc * sizeof(uint32_t);
+
+        for (int x0=0; x0<N; x0+=(4 * get_vector_length<uint32_t>())) {
+            const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<uint32_t>()));
+            const uint32_t *betaptr = &beta;
+            long loops = loops_count;
+            long regs = regs_count;
+            long temp = 0;
+            long blocks = blocks_count;
+            const uint8_t *a_ptr0 = a_ptr0_base;
+            const uint8_t *b_ptr0 = B + (K_stride * x0);
+
+            switch(M-y) {
+                case 1:
+                    __asm __volatile (
+                        "whilelt p6.b, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.b\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "mov z18.s, #0\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z19.s, #0\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "mul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "mul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mul z19.s, p7/m, z19.s, z15.s\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "5:\n"
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                    );
+                    break;
+                case 2:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "c_ptr1 .req X1\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "whilelt p6.b, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.b\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mov z18.s, #0\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "mov z19.s, #0\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z20.s, #0\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "mov z21.s, #0\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z22.s, #0\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "mov z23.s, #0\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "mul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
+                        "mul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "mul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "mul z19.s, p7/m, z19.s, z15.s\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "mul z20.s, p7/m, z20.s, z15.s\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mul z21.s, p7/m, z21.s, z15.s\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mul z22.s, p7/m, z22.s, z15.s\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "mul z23.s, p7/m, z23.s, z15.s\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z21.s, z9.b, z5.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z21.s, z13.b, z5.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z21.s, z9.b, z5.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z20.s, z12.b, z5.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z21.s, z13.b, z5.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z22.s, z14.b, z5.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "udot z23.s, z15.b, z5.b[3]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z21.s, z9.b, z5.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z21.s, z13.b, z5.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z21.s, z9.b, z5.b[2]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z20.s, z12.b, z5.b[3]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z21.s, z13.b, z5.b[3]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z22.s, z14.b, z5.b[3]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "udot z23.s, z15.b, z5.b[3]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z21.s, z9.b, z5.b[0]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z21.s, z13.b, z5.b[1]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z21.s, z9.b, z5.b[2]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z20.s, z12.b, z5.b[3]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z21.s, z13.b, z5.b[3]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z22.s, z14.b, z5.b[3]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "udot z23.s, z15.b, z5.b[3]\n"
+                        "5:\n"
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1w z20.s, p0, [c_ptr1]\n"
+                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq c_ptr1\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+                    );
+                    break;
+                case 3:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "c_ptr1 .req X2\n"
+                        "c_ptr2 .req X3\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "whilelt p6.b, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.b\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mov z18.s, #0\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "mov z19.s, #0\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "mov z20.s, #0\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z21.s, #0\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "mov z22.s, #0\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z23.s, #0\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "mov z24.s, #0\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "mov z25.s, #0\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "mov z26.s, #0\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "mov z27.s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "mul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
+                        "mul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "mul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "mul z19.s, p7/m, z19.s, z15.s\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "mul z20.s, p7/m, z20.s, z15.s\n"
+                        "ld1w z24.s, p0/z, [c_ptr2]\n"
+                        "mul z21.s, p7/m, z21.s, z15.s\n"
+                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "mul z22.s, p7/m, z22.s, z15.s\n"
+                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "mul z23.s, p7/m, z23.s, z15.s\n"
+                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "mul z24.s, p7/m, z24.s, z15.s\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mul z25.s, p7/m, z25.s, z15.s\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mul z26.s, p7/m, z26.s, z15.s\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "mul z27.s, p7/m, z27.s, z15.s\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "udot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "udot z26.s, z10.b, z2.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "udot z27.s, z11.b, z2.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z24.s, z12.b, z2.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "udot z25.s, z13.b, z2.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z26.s, z14.b, z2.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "udot z27.s, z15.b, z2.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z24.s, z8.b, z2.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z25.s, z9.b, z2.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z26.s, z10.b, z2.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "udot z27.s, z11.b, z2.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z24.s, z12.b, z2.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "udot z25.s, z13.b, z2.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z26.s, z14.b, z2.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+                        "udot z27.s, z15.b, z2.b[3]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "udot z24.s, z8.b, z6.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z21.s, z9.b, z5.b[0]\n"
+                        "udot z25.s, z9.b, z6.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "udot z26.s, z10.b, z6.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "udot z27.s, z11.b, z6.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "udot z24.s, z12.b, z6.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z21.s, z13.b, z5.b[1]\n"
+                        "udot z25.s, z13.b, z6.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "udot z26.s, z14.b, z6.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "udot z27.s, z15.b, z6.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z24.s, z8.b, z6.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "udot z21.s, z9.b, z5.b[2]\n"
+                        "udot z25.s, z9.b, z6.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "udot z26.s, z10.b, z6.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "udot z27.s, z11.b, z6.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z20.s, z12.b, z5.b[3]\n"
+                        "udot z24.s, z12.b, z6.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z21.s, z13.b, z5.b[3]\n"
+                        "udot z25.s, z13.b, z6.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z22.s, z14.b, z5.b[3]\n"
+                        "udot z26.s, z14.b, z6.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "udot z23.s, z15.b, z5.b[3]\n"
+                        "udot z27.s, z15.b, z6.b[3]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "udot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "udot z26.s, z10.b, z2.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "udot z27.s, z11.b, z2.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z24.s, z12.b, z2.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "udot z25.s, z13.b, z2.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z26.s, z14.b, z2.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "udot z27.s, z15.b, z2.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z24.s, z8.b, z2.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z25.s, z9.b, z2.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z26.s, z10.b, z2.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "udot z27.s, z11.b, z2.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z24.s, z12.b, z2.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "udot z25.s, z13.b, z2.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z26.s, z14.b, z2.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+                        "udot z27.s, z15.b, z2.b[3]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "udot z24.s, z8.b, z6.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z21.s, z9.b, z5.b[0]\n"
+                        "udot z25.s, z9.b, z6.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "udot z26.s, z10.b, z6.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "udot z27.s, z11.b, z6.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "udot z24.s, z12.b, z6.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z21.s, z13.b, z5.b[1]\n"
+                        "udot z25.s, z13.b, z6.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "udot z26.s, z14.b, z6.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "udot z27.s, z15.b, z6.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z24.s, z8.b, z6.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z21.s, z9.b, z5.b[2]\n"
+                        "udot z25.s, z9.b, z6.b[2]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "udot z26.s, z10.b, z6.b[2]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "udot z27.s, z11.b, z6.b[2]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z20.s, z12.b, z5.b[3]\n"
+                        "udot z24.s, z12.b, z6.b[3]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z21.s, z13.b, z5.b[3]\n"
+                        "udot z25.s, z13.b, z6.b[3]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z22.s, z14.b, z5.b[3]\n"
+                        "udot z26.s, z14.b, z6.b[3]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "udot z23.s, z15.b, z5.b[3]\n"
+                        "udot z27.s, z15.b, z6.b[3]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "udot z24.s, z8.b, z2.b[0]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "udot z26.s, z10.b, z2.b[0]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "udot z27.s, z11.b, z2.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z24.s, z12.b, z2.b[1]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "udot z25.s, z13.b, z2.b[1]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z26.s, z14.b, z2.b[1]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "udot z27.s, z15.b, z2.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z24.s, z8.b, z2.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z25.s, z9.b, z2.b[2]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z26.s, z10.b, z2.b[2]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "udot z27.s, z11.b, z2.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z24.s, z12.b, z2.b[3]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "udot z25.s, z13.b, z2.b[3]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z26.s, z14.b, z2.b[3]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "udot z27.s, z15.b, z2.b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+                        "udot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "udot z26.s, z10.b, z2.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "udot z27.s, z11.b, z2.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z24.s, z12.b, z2.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "udot z25.s, z13.b, z2.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z26.s, z14.b, z2.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "udot z27.s, z15.b, z2.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z24.s, z8.b, z2.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z25.s, z9.b, z2.b[2]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z26.s, z10.b, z2.b[2]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "udot z27.s, z11.b, z2.b[2]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z24.s, z12.b, z2.b[3]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "udot z25.s, z13.b, z2.b[3]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z26.s, z14.b, z2.b[3]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "udot z27.s, z15.b, z2.b[3]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "udot z24.s, z8.b, z6.b[0]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z21.s, z9.b, z5.b[0]\n"
+                        "udot z25.s, z9.b, z6.b[0]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "udot z26.s, z10.b, z6.b[0]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "udot z27.s, z11.b, z6.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "udot z24.s, z12.b, z6.b[1]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z21.s, z13.b, z5.b[1]\n"
+                        "udot z25.s, z13.b, z6.b[1]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "udot z26.s, z14.b, z6.b[1]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "udot z27.s, z15.b, z6.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z24.s, z8.b, z6.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z21.s, z9.b, z5.b[2]\n"
+                        "udot z25.s, z9.b, z6.b[2]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "udot z26.s, z10.b, z6.b[2]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "udot z27.s, z11.b, z6.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z20.s, z12.b, z5.b[3]\n"
+                        "udot z24.s, z12.b, z6.b[3]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z21.s, z13.b, z5.b[3]\n"
+                        "udot z25.s, z13.b, z6.b[3]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z22.s, z14.b, z5.b[3]\n"
+                        "udot z26.s, z14.b, z6.b[3]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "udot z23.s, z15.b, z5.b[3]\n"
+                        "udot z27.s, z15.b, z6.b[3]\n"
+                        "5:\n"
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1w z20.s, p0, [c_ptr1]\n"
+                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+                        "st1w z24.s, p0, [c_ptr2]\n"
+                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+                    );
+                    break;
+                default:
+                case 4:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "a_ptr3 .req X2\n"
+                        "c_ptr1 .req X3\n"
+                        "c_ptr2 .req X4\n"
+                        "c_ptr3 .req X5\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "add a_ptr3, a_ptr2, %[lda]\n"
+                        "add c_ptr3, c_ptr2, %[ldc]\n"
+                        "whilelt p6.b, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.b\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p2.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "whilelt p3.s, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.s, #0\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.s, #0\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mov z18.s, #0\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "mov z19.s, #0\n"
+                        "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                        "mov z20.s, #0\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "mov z21.s, #0\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z22.s, #0\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "mov z23.s, #0\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z24.s, #0\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "mov z25.s, #0\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "mov z26.s, #0\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "mov z27.s, #0\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "mov z28.s, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "mov z29.s, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "mov z30.s, #0\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "mov z31.s, #0\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1rw z15.s, p7/z, [%[betaptr]]\n"
+                        "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "mul z16.s, p7/m, z16.s, z15.s\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
+                        "mul z17.s, p7/m, z17.s, z15.s\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "mul z18.s, p7/m, z18.s, z15.s\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "mul z19.s, p7/m, z19.s, z15.s\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "mul z20.s, p7/m, z20.s, z15.s\n"
+                        "ld1w z24.s, p0/z, [c_ptr2]\n"
+                        "mul z21.s, p7/m, z21.s, z15.s\n"
+                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "mul z22.s, p7/m, z22.s, z15.s\n"
+                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "mul z23.s, p7/m, z23.s, z15.s\n"
+                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "mul z24.s, p7/m, z24.s, z15.s\n"
+                        "ld1w z28.s, p0/z, [c_ptr3]\n"
+                        "mul z25.s, p7/m, z25.s, z15.s\n"
+                        "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+                        "mul z26.s, p7/m, z26.s, z15.s\n"
+                        "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+                        "mul z27.s, p7/m, z27.s, z15.s\n"
+                        "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+                        "mul z28.s, p7/m, z28.s, z15.s\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "mul z29.s, p7/m, z29.s, z15.s\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "mul z30.s, p7/m, z30.s, z15.s\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "mul z31.s, p7/m, z31.s, z15.s\n"
+                        "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #8\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "udot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "udot z28.s, z8.b, z3.b[0]\n"
+                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "udot z29.s, z9.b, z3.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "udot z26.s, z10.b, z2.b[0]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "udot z30.s, z10.b, z3.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "add a_ptr3, a_ptr3, #0x20\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "udot z27.s, z11.b, z2.b[0]\n"
+                        "udot z31.s, z11.b, z3.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z24.s, z12.b, z2.b[1]\n"
+                        "udot z28.s, z12.b, z3.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "udot z25.s, z13.b, z2.b[1]\n"
+                        "udot z29.s, z13.b, z3.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z26.s, z14.b, z2.b[1]\n"
+                        "udot z30.s, z14.b, z3.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "udot z27.s, z15.b, z2.b[1]\n"
+                        "udot z31.s, z15.b, z3.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z24.s, z8.b, z2.b[2]\n"
+                        "udot z28.s, z8.b, z3.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z25.s, z9.b, z2.b[2]\n"
+                        "udot z29.s, z9.b, z3.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z26.s, z10.b, z2.b[2]\n"
+                        "udot z30.s, z10.b, z3.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "udot z27.s, z11.b, z2.b[2]\n"
+                        "udot z31.s, z11.b, z3.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z24.s, z12.b, z2.b[3]\n"
+                        "udot z28.s, z12.b, z3.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "udot z25.s, z13.b, z2.b[3]\n"
+                        "udot z29.s, z13.b, z3.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z26.s, z14.b, z2.b[3]\n"
+                        "udot z30.s, z14.b, z3.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
+                        "udot z27.s, z15.b, z2.b[3]\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
+                        "udot z31.s, z15.b, z3.b[3]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "udot z24.s, z8.b, z6.b[0]\n"
+                        "udot z28.s, z8.b, z7.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z21.s, z9.b, z5.b[0]\n"
+                        "udot z25.s, z9.b, z6.b[0]\n"
+                        "udot z29.s, z9.b, z7.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "udot z26.s, z10.b, z6.b[0]\n"
+                        "udot z30.s, z10.b, z7.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "udot z27.s, z11.b, z6.b[0]\n"
+                        "udot z31.s, z11.b, z7.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "udot z24.s, z12.b, z6.b[1]\n"
+                        "udot z28.s, z12.b, z7.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z21.s, z13.b, z5.b[1]\n"
+                        "udot z25.s, z13.b, z6.b[1]\n"
+                        "udot z29.s, z13.b, z7.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "udot z26.s, z14.b, z6.b[1]\n"
+                        "udot z30.s, z14.b, z7.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "udot z27.s, z15.b, z6.b[1]\n"
+                        "udot z31.s, z15.b, z7.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z24.s, z8.b, z6.b[2]\n"
+                        "udot z28.s, z8.b, z7.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z21.s, z9.b, z5.b[2]\n"
+                        "udot z25.s, z9.b, z6.b[2]\n"
+                        "udot z29.s, z9.b, z7.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "udot z26.s, z10.b, z6.b[2]\n"
+                        "udot z30.s, z10.b, z7.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "udot z27.s, z11.b, z6.b[2]\n"
+                        "udot z31.s, z11.b, z7.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z20.s, z12.b, z5.b[3]\n"
+                        "udot z24.s, z12.b, z6.b[3]\n"
+                        "udot z28.s, z12.b, z7.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z21.s, z13.b, z5.b[3]\n"
+                        "udot z25.s, z13.b, z6.b[3]\n"
+                        "udot z29.s, z13.b, z7.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z22.s, z14.b, z5.b[3]\n"
+                        "udot z26.s, z14.b, z6.b[3]\n"
+                        "udot z30.s, z14.b, z7.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "udot z23.s, z15.b, z5.b[3]\n"
+                        "udot z27.s, z15.b, z6.b[3]\n"
+                        "udot z31.s, z15.b, z7.b[3]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "udot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "udot z28.s, z8.b, z3.b[0]\n"
+                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "udot z29.s, z9.b, z3.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "udot z26.s, z10.b, z2.b[0]\n"
+                        "udot z30.s, z10.b, z3.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "udot z27.s, z11.b, z2.b[0]\n"
+                        "udot z31.s, z11.b, z3.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z24.s, z12.b, z2.b[1]\n"
+                        "udot z28.s, z12.b, z3.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "udot z25.s, z13.b, z2.b[1]\n"
+                        "udot z29.s, z13.b, z3.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z26.s, z14.b, z2.b[1]\n"
+                        "udot z30.s, z14.b, z3.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "udot z27.s, z15.b, z2.b[1]\n"
+                        "udot z31.s, z15.b, z3.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z24.s, z8.b, z2.b[2]\n"
+                        "udot z28.s, z8.b, z3.b[2]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z25.s, z9.b, z2.b[2]\n"
+                        "udot z29.s, z9.b, z3.b[2]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z26.s, z10.b, z2.b[2]\n"
+                        "udot z30.s, z10.b, z3.b[2]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "udot z27.s, z11.b, z2.b[2]\n"
+                        "udot z31.s, z11.b, z3.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z24.s, z12.b, z2.b[3]\n"
+                        "udot z28.s, z12.b, z3.b[3]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "udot z25.s, z13.b, z2.b[3]\n"
+                        "udot z29.s, z13.b, z3.b[3]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z26.s, z14.b, z2.b[3]\n"
+                        "udot z30.s, z14.b, z3.b[3]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+                        "udot z27.s, z15.b, z2.b[3]\n"
+                        "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+                        "udot z31.s, z15.b, z3.b[3]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "udot z24.s, z8.b, z6.b[0]\n"
+                        "udot z28.s, z8.b, z7.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z21.s, z9.b, z5.b[0]\n"
+                        "udot z25.s, z9.b, z6.b[0]\n"
+                        "udot z29.s, z9.b, z7.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "udot z26.s, z10.b, z6.b[0]\n"
+                        "udot z30.s, z10.b, z7.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "udot z27.s, z11.b, z6.b[0]\n"
+                        "udot z31.s, z11.b, z7.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "udot z24.s, z12.b, z6.b[1]\n"
+                        "udot z28.s, z12.b, z7.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z21.s, z13.b, z5.b[1]\n"
+                        "udot z25.s, z13.b, z6.b[1]\n"
+                        "udot z29.s, z13.b, z7.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "udot z26.s, z14.b, z6.b[1]\n"
+                        "udot z30.s, z14.b, z7.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "udot z27.s, z15.b, z6.b[1]\n"
+                        "udot z31.s, z15.b, z7.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z24.s, z8.b, z6.b[2]\n"
+                        "udot z28.s, z8.b, z7.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z21.s, z9.b, z5.b[2]\n"
+                        "udot z25.s, z9.b, z6.b[2]\n"
+                        "udot z29.s, z9.b, z7.b[2]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "udot z26.s, z10.b, z6.b[2]\n"
+                        "udot z30.s, z10.b, z7.b[2]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "udot z27.s, z11.b, z6.b[2]\n"
+                        "udot z31.s, z11.b, z7.b[2]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z20.s, z12.b, z5.b[3]\n"
+                        "udot z24.s, z12.b, z6.b[3]\n"
+                        "udot z28.s, z12.b, z7.b[3]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z21.s, z13.b, z5.b[3]\n"
+                        "udot z25.s, z13.b, z6.b[3]\n"
+                        "udot z29.s, z13.b, z7.b[3]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z22.s, z14.b, z5.b[3]\n"
+                        "udot z26.s, z14.b, z6.b[3]\n"
+                        "udot z30.s, z14.b, z7.b[3]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "udot z23.s, z15.b, z5.b[3]\n"
+                        "udot z27.s, z15.b, z6.b[3]\n"
+                        "udot z31.s, z15.b, z7.b[3]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "udot z24.s, z8.b, z2.b[0]\n"
+                        "udot z28.s, z8.b, z3.b[0]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "udot z29.s, z9.b, z3.b[0]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "udot z26.s, z10.b, z2.b[0]\n"
+                        "udot z30.s, z10.b, z3.b[0]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "udot z27.s, z11.b, z2.b[0]\n"
+                        "udot z31.s, z11.b, z3.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z24.s, z12.b, z2.b[1]\n"
+                        "udot z28.s, z12.b, z3.b[1]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "udot z25.s, z13.b, z2.b[1]\n"
+                        "udot z29.s, z13.b, z3.b[1]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z26.s, z14.b, z2.b[1]\n"
+                        "udot z30.s, z14.b, z3.b[1]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "udot z27.s, z15.b, z2.b[1]\n"
+                        "udot z31.s, z15.b, z3.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z24.s, z8.b, z2.b[2]\n"
+                        "udot z28.s, z8.b, z3.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z25.s, z9.b, z2.b[2]\n"
+                        "udot z29.s, z9.b, z3.b[2]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z26.s, z10.b, z2.b[2]\n"
+                        "udot z30.s, z10.b, z3.b[2]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "udot z27.s, z11.b, z2.b[2]\n"
+                        "udot z31.s, z11.b, z3.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z24.s, z12.b, z2.b[3]\n"
+                        "udot z28.s, z12.b, z3.b[3]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "udot z25.s, z13.b, z2.b[3]\n"
+                        "udot z29.s, z13.b, z3.b[3]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z26.s, z14.b, z2.b[3]\n"
+                        "udot z30.s, z14.b, z3.b[3]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "udot z27.s, z15.b, z2.b[3]\n"
+                        "udot z31.s, z15.b, z3.b[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z20.s, z8.b, z1.b[0]\n"
+                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
+                        "udot z24.s, z8.b, z2.b[0]\n"
+                        "ld1rqb z5.b, p6/z, [a_ptr1]\n"
+                        "udot z28.s, z8.b, z3.b[0]\n"
+                        "ld1rqb z6.b, p6/z, [a_ptr2]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1rqb z7.b, p6/z, [a_ptr3]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "udot z29.s, z9.b, z3.b[0]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "udot z26.s, z10.b, z2.b[0]\n"
+                        "udot z30.s, z10.b, z3.b[0]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "udot z23.s, z11.b, z1.b[0]\n"
+                        "udot z27.s, z11.b, z2.b[0]\n"
+                        "udot z31.s, z11.b, z3.b[0]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z24.s, z12.b, z2.b[1]\n"
+                        "udot z28.s, z12.b, z3.b[1]\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "udot z17.s, z13.b, z0.b[1]\n"
+                        "udot z21.s, z13.b, z1.b[1]\n"
+                        "udot z25.s, z13.b, z2.b[1]\n"
+                        "udot z29.s, z13.b, z3.b[1]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z26.s, z14.b, z2.b[1]\n"
+                        "udot z30.s, z14.b, z3.b[1]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "udot z27.s, z15.b, z2.b[1]\n"
+                        "udot z31.s, z15.b, z3.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z24.s, z8.b, z2.b[2]\n"
+                        "udot z28.s, z8.b, z3.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z21.s, z9.b, z1.b[2]\n"
+                        "udot z25.s, z9.b, z2.b[2]\n"
+                        "udot z29.s, z9.b, z3.b[2]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z26.s, z10.b, z2.b[2]\n"
+                        "udot z30.s, z10.b, z3.b[2]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "udot z27.s, z11.b, z2.b[2]\n"
+                        "udot z31.s, z11.b, z3.b[2]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z24.s, z12.b, z2.b[3]\n"
+                        "udot z28.s, z12.b, z3.b[3]\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z21.s, z13.b, z1.b[3]\n"
+                        "udot z25.s, z13.b, z2.b[3]\n"
+                        "udot z29.s, z13.b, z3.b[3]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z26.s, z14.b, z2.b[3]\n"
+                        "udot z30.s, z14.b, z3.b[3]\n"
+                        "udot z19.s, z15.b, z0.b[3]\n"
+                        "udot z23.s, z15.b, z1.b[3]\n"
+                        "udot z27.s, z15.b, z2.b[3]\n"
+                        "udot z31.s, z15.b, z3.b[3]\n"
+                        "cbz %[blocks], 5f\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "udot z24.s, z8.b, z6.b[0]\n"
+                        "udot z28.s, z8.b, z7.b[0]\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
+                        "udot z21.s, z9.b, z5.b[0]\n"
+                        "udot z25.s, z9.b, z6.b[0]\n"
+                        "udot z29.s, z9.b, z7.b[0]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "udot z26.s, z10.b, z6.b[0]\n"
+                        "udot z30.s, z10.b, z7.b[0]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "udot z27.s, z11.b, z6.b[0]\n"
+                        "udot z31.s, z11.b, z7.b[0]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "udot z24.s, z12.b, z6.b[1]\n"
+                        "udot z28.s, z12.b, z7.b[1]\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z21.s, z13.b, z5.b[1]\n"
+                        "udot z25.s, z13.b, z6.b[1]\n"
+                        "udot z29.s, z13.b, z7.b[1]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "udot z26.s, z14.b, z6.b[1]\n"
+                        "udot z30.s, z14.b, z7.b[1]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "udot z27.s, z15.b, z6.b[1]\n"
+                        "udot z31.s, z15.b, z7.b[1]\n"
+                        "b.eq 5f\n"
+                        "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z24.s, z8.b, z6.b[2]\n"
+                        "udot z28.s, z8.b, z7.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z21.s, z9.b, z5.b[2]\n"
+                        "udot z25.s, z9.b, z6.b[2]\n"
+                        "udot z29.s, z9.b, z7.b[2]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "udot z26.s, z10.b, z6.b[2]\n"
+                        "udot z30.s, z10.b, z7.b[2]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "udot z27.s, z11.b, z6.b[2]\n"
+                        "udot z31.s, z11.b, z7.b[2]\n"
+                        "b.eq 5f\n"
+                        "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "udot z20.s, z12.b, z5.b[3]\n"
+                        "udot z24.s, z12.b, z6.b[3]\n"
+                        "udot z28.s, z12.b, z7.b[3]\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z21.s, z13.b, z5.b[3]\n"
+                        "udot z25.s, z13.b, z6.b[3]\n"
+                        "udot z29.s, z13.b, z7.b[3]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "udot z22.s, z14.b, z5.b[3]\n"
+                        "udot z26.s, z14.b, z6.b[3]\n"
+                        "udot z30.s, z14.b, z7.b[3]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "udot z23.s, z15.b, z5.b[3]\n"
+                        "udot z27.s, z15.b, z6.b[3]\n"
+                        "udot z31.s, z15.b, z7.b[3]\n"
+                        "5:\n"
+                        "st1w z16.s, p0, [%[c_ptr0]]\n"
+                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1w z20.s, p0, [c_ptr1]\n"
+                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
+                        "st1w z24.s, p0, [c_ptr2]\n"
+                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
+                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
+                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
+                        "st1w z28.s, p0, [c_ptr3]\n"
+                        "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
+                        "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
+                        "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq a_ptr3\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        ".unreq c_ptr3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+                    );
+                    break;
+            }
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp
index f4d33a9..8228df4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4.hpp
new file mode 100644
index 0000000..6cce601
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+
+
+namespace arm_gemm
+{
+
+// Kernel entry point, defined in sve_native_fp16_mla_4VLx4/generic.cpp; lda/ldb/ldc are counted in __fp16 elements (the kernel multiplies them by sizeof(__fp16)).
+void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ldb, __fp16 *C, int ldc, __fp16 beta, int M, int N, int K);
+
+// Describes the SVE FP16 kernel sve_native_fp16_mla_4VLx4 (types, blocking sizes, kernel pointer); passed as the template argument to GemmNative.
+class native_fp16_mla_4VLx4
+{
+public:
+    typedef __fp16 operand_type; // A and B are const __fp16 * in kern_type
+    typedef __fp16 result_type;  // C is __fp16 * in kern_type
+
+    // Pointer type matching sve_native_fp16_mla_4VLx4(); parameter names follow its definition in generic.cpp.
+    typedef void (*kern_type)(const __fp16 *A, int lda, const __fp16 *B, int ldb, __fp16 *C, int ldc, __fp16 beta, int M, int N, int K);
+
+    /* Kernel blocking parameters */
+    static unsigned int out_height() // rows of C per block: the kernel's row loop advances y by 4
+    {
+        return 4;
+    }
+
+    static unsigned int out_width() // columns of C per block: four SVE vectors of __fp16, as in the kernel's x0 loop
+    {
+        return get_vector_length<__fp16>() * 4;
+    }
+
+    static unsigned int k_unroll() // NOTE(review): presumably means K needs no rounding up - confirm against GemmNative
+    {
+        return 1;
+    }
+
+    // Default to the generic kernel
+    kern_type kernel=sve_native_fp16_mla_4VLx4;
+
+    // Only one kernel is declared in this header, so the CPUInfo argument is unused; it is left unnamed to avoid an unused-parameter warning.
+    native_fp16_mla_4VLx4(const CPUInfo *)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp
new file mode 100644
index 0000000..f1aaeb1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp
@@ -0,0 +1,3821 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ldb, __fp16 *C, int ldc, __fp16 beta, int M, int N, int K) {
+    const long beta0 = (beta == 0.0f);
+    const long loops_count = ((K + 8) / 16) - 1;
+    K -= loops_count * 16;
+    const long regs_count = (K / 8) - 1;
+    K -= (regs_count + 1) * 8;
+    const long leftovers = K;
+
+    for (int y=0; y<M; y+=4) {
+        const __fp16 * const a_ptr0_base = A + (y * lda);
+        const unsigned long ldab = lda * sizeof(__fp16);
+
+        __fp16 *c_ptr0 = C + (y * ldc);
+        const unsigned long ldcb = ldc * sizeof(__fp16);
+
+        for (int x0=0; x0<N; x0+=(4 * get_vector_length<__fp16>())) {
+            const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<__fp16>()));
+            const __fp16 *betaptr = &beta;
+            long loops = loops_count;
+            long regs = regs_count;
+            long temp = 0;
+            long blocks = leftovers;
+            const __fp16 *a_ptr0 = a_ptr0_base;
+            const __fp16 *b_ptr0 = B + x0;
+            long ldbb = ldb * sizeof(__fp16);
+
+            switch(M-y) {
+                case 1:
+                    __asm __volatile (
+                        "whilelt p6.h, %[temp], %[leftovers]\n"
+                        "whilelt p0.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "ptrue p7.h\n"
+                        "whilelt p1.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p2.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p3.h, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.h, #0\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.h, #0\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "mov z18.h, #0\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z19.h, #0\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+                        "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+                        "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "fmul z16.h, p7/m, z16.h, z15.h\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "fmul z17.h, p7/m, z17.h, z15.h\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmul z18.h, p7/m, z18.h, z15.h\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmul z19.h, p7/m, z19.h, z15.h\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "cbz %[blocks], 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "cbz %[blocks], 5f\n" // blocks == 0: nothing left to accumulate, go straight to the stores at 5:.
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n" // Step b_ptr0 by ldb before loading the next four B vectors.
+                        "subs %[blocks], %[blocks], #0x1\n" // Count this step; the flags are consumed by the b.eq that ends the step.
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n" // Each leftover step uses the next lane of z4 (lane 0 here, up to lane 6 below).
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "b.eq 5f\n" // The subs above reached zero: that was the last step, so finish at 5:.
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "5:\n" // Common exit: target of the cbz/b.eq branches in the leftover steps above.
+                        "st1h z16.h, p0, [%[c_ptr0]]\n" // Store accumulators z16-z19 at c_ptr0, each under its own predicate (p0-p3).
+                        "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n" // Advance c_ptr0 by four vector lengths, i.e. past the four vectors just stored.
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) // Read-write operands: the asm updates these in place.
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb) // Read-only operands; note lda/ldc/ldb are bound to the ldab/ldcb/ldbb variables.
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory" // Clobbers: all SVE vector registers, the condition flags and memory.
+                    );
+                    break;
+                case 2: // NOTE(review): presumably the two-row variant (a second A/C row pointer is derived below) - confirm against the switch expression, which is outside this chunk.
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n" // Assembler aliases: a_ptr1 -> x0, c_ptr1 -> x1. NOTE(review): confirm x0/x1 are listed as clobbers of this statement (its clobber list is not visible here).
+                        "c_ptr1 .req X1\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n" // a_ptr1 = a_ptr0 + lda
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n" // c_ptr1 = c_ptr0 + ldc
+                        "whilelt p6.h, %[temp], %[leftovers]\n" // p6: halfword lanes, counted up from temp, that are still below leftovers.
+                        "whilelt p0.h, %[temp], %[width]\n" // p0-p3: lanes below width for four successive vectors; temp is advanced by one vector of halfwords between them.
+                        "inch %[temp], all, mul #1\n"
+                        "ptrue p7.h\n" // p7: every halfword lane active.
+                        "whilelt p1.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p2.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p3.h, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n" // beta0 == 0: go to 1: (load C and scale it by the value at betaptr); otherwise fall through and start from zeroed accumulators.
+                        "mov z16.h, #0\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.h, #0\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+                        "mov z18.h, #0\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "mov z19.h, #0\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z20.h, #0\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "mov z21.h, #0\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z22.h, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "mov z23.h, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "cbz %[loops], 2f\n" // loops == 0: skip the main loop and continue at 2:.
+                        "b 3f\n" // Otherwise jump over the beta-scaling path into the main loop at 3:.
+                        "1:\n" // Entered from the cbz on beta0: accumulators start from the existing C values.
+                        "ld1rh z15.h, p7/z, [%[betaptr]]\n" // Broadcast the halfword at betaptr to every lane of z15; it is the multiplier in the fmul sequence that follows.
+                        "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+                        "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "fmul z16.h, p7/m, z16.h, z15.h\n"
+                        "ld1h z20.h, p0/z, [c_ptr1]\n"
+                        "fmul z17.h, p7/m, z17.h, z15.h\n"
+                        "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "fmul z18.h, p7/m, z18.h, z15.h\n"
+                        "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "fmul z19.h, p7/m, z19.h, z15.h\n"
+                        "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "fmul z20.h, p7/m, z20.h, z15.h\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "fmul z21.h, p7/m, z21.h, z15.h\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+                        "fmul z22.h, p7/m, z22.h, z15.h\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmul z23.h, p7/m, z23.h, z15.h\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[7]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "fmla z21.h, z13.h, z5.h[7]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "fmla z22.h, z14.h, z5.h[7]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "fmla z23.h, z15.h, z5.h[7]\n"
+                        "b.ne 3b\n" // Repeat while the subs on loops near the top of 3: left a non-zero count (nothing in between touches the flags).
+                        "2:\n" // Reached when loops was zero on entry or has just counted down to zero.
+                        "cbz %[regs], 4f\n" // regs == 0: branch to 4: (defined beyond this chunk); otherwise execute the block that follows.
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n" // Reload z0 from a_ptr0 + 16 bytes under p6 (the leftovers predicate); inactive lanes are zeroed.
+                        "fmla z23.h, z15.h, z1.h[7]\n" // Last use of the old z1 contents before z1 is reloaded two lines below.
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n" // Same predicated reload for the second row, via a_ptr1.
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "fmla z20.h, z12.h, z5.h[7]\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "fmla z21.h, z13.h, z5.h[7]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "fmla z22.h, z14.h, z5.h[7]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "fmla z23.h, z15.h, z5.h[7]\n"
+                        "cbz %[blocks], 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "cbz %[blocks], 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "5:\n"
+                        "st1h z16.h, p0, [%[c_ptr0]]\n"
+                        "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1h z20.h, p0, [c_ptr1]\n"
+                        "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq c_ptr1\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+                    );
+                    break;
+                case 3:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "c_ptr1 .req X2\n"
+                        "c_ptr2 .req X3\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "whilelt p6.h, %[temp], %[leftovers]\n"
+                        "whilelt p0.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "ptrue p7.h\n"
+                        "whilelt p1.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p2.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p3.h, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.h, #0\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.h, #0\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+                        "mov z18.h, #0\n"
+                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+                        "mov z19.h, #0\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "mov z20.h, #0\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z21.h, #0\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "mov z22.h, #0\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z23.h, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "mov z24.h, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "mov z25.h, #0\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "mov z26.h, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z27.h, #0\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+                        "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+                        "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "fmul z16.h, p7/m, z16.h, z15.h\n"
+                        "ld1h z20.h, p0/z, [c_ptr1]\n"
+                        "fmul z17.h, p7/m, z17.h, z15.h\n"
+                        "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "fmul z18.h, p7/m, z18.h, z15.h\n"
+                        "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "fmul z19.h, p7/m, z19.h, z15.h\n"
+                        "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "fmul z20.h, p7/m, z20.h, z15.h\n"
+                        "ld1h z24.h, p0/z, [c_ptr2]\n"
+                        "fmul z21.h, p7/m, z21.h, z15.h\n"
+                        "ld1h z25.h, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "fmul z22.h, p7/m, z22.h, z15.h\n"
+                        "ld1h z26.h, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "fmul z23.h, p7/m, z23.h, z15.h\n"
+                        "ld1h z27.h, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "fmul z24.h, p7/m, z24.h, z15.h\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "fmul z25.h, p7/m, z25.h, z15.h\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+                        "fmul z26.h, p7/m, z26.h, z15.h\n"
+                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+                        "fmul z27.h, p7/m, z27.h, z15.h\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "fmla z24.h, z12.h, z2.h[7]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "fmla z25.h, z13.h, z2.h[7]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "fmla z26.h, z14.h, z2.h[7]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
+                        "fmla z27.h, z15.h, z2.h[7]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z24.h, z8.h, z6.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "fmla z25.h, z9.h, z6.h[0]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "fmla z26.h, z10.h, z6.h[0]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "fmla z27.h, z11.h, z6.h[0]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "fmla z24.h, z12.h, z6.h[1]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "fmla z25.h, z13.h, z6.h[1]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "fmla z26.h, z14.h, z6.h[1]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "fmla z27.h, z15.h, z6.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z24.h, z8.h, z6.h[2]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "fmla z25.h, z9.h, z6.h[2]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "fmla z26.h, z10.h, z6.h[2]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "fmla z27.h, z11.h, z6.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "fmla z24.h, z12.h, z6.h[3]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "fmla z25.h, z13.h, z6.h[3]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "fmla z26.h, z14.h, z6.h[3]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "fmla z27.h, z15.h, z6.h[3]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "fmla z24.h, z8.h, z6.h[4]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "fmla z25.h, z9.h, z6.h[4]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "fmla z26.h, z10.h, z6.h[4]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "fmla z27.h, z11.h, z6.h[4]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "fmla z24.h, z12.h, z6.h[5]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "fmla z25.h, z13.h, z6.h[5]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "fmla z26.h, z14.h, z6.h[5]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "fmla z27.h, z15.h, z6.h[5]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z24.h, z8.h, z6.h[6]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z25.h, z9.h, z6.h[6]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z26.h, z10.h, z6.h[6]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "fmla z27.h, z11.h, z6.h[6]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[7]\n"
+                        "fmla z24.h, z12.h, z6.h[7]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "fmla z21.h, z13.h, z5.h[7]\n"
+                        "fmla z25.h, z13.h, z6.h[7]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "fmla z22.h, z14.h, z5.h[7]\n"
+                        "fmla z26.h, z14.h, z6.h[7]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "fmla z23.h, z15.h, z5.h[7]\n"
+                        "fmla z27.h, z15.h, z6.h[7]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "fmla z24.h, z12.h, z2.h[7]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "fmla z25.h, z13.h, z2.h[7]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "fmla z26.h, z14.h, z2.h[7]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
+                        "fmla z27.h, z15.h, z2.h[7]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z24.h, z8.h, z6.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "fmla z25.h, z9.h, z6.h[0]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "fmla z26.h, z10.h, z6.h[0]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "fmla z27.h, z11.h, z6.h[0]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "fmla z24.h, z12.h, z6.h[1]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "fmla z25.h, z13.h, z6.h[1]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "fmla z26.h, z14.h, z6.h[1]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "fmla z27.h, z15.h, z6.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z24.h, z8.h, z6.h[2]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "fmla z25.h, z9.h, z6.h[2]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "fmla z26.h, z10.h, z6.h[2]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "fmla z27.h, z11.h, z6.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "fmla z24.h, z12.h, z6.h[3]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "fmla z25.h, z13.h, z6.h[3]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "fmla z26.h, z14.h, z6.h[3]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "fmla z27.h, z15.h, z6.h[3]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "fmla z24.h, z8.h, z6.h[4]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "fmla z25.h, z9.h, z6.h[4]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "fmla z26.h, z10.h, z6.h[4]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "fmla z27.h, z11.h, z6.h[4]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "fmla z24.h, z12.h, z6.h[5]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "fmla z25.h, z13.h, z6.h[5]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "fmla z26.h, z14.h, z6.h[5]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "fmla z27.h, z15.h, z6.h[5]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z24.h, z8.h, z6.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z25.h, z9.h, z6.h[6]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z26.h, z10.h, z6.h[6]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "fmla z27.h, z11.h, z6.h[6]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "fmla z20.h, z12.h, z5.h[7]\n"
+                        "fmla z24.h, z12.h, z6.h[7]\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "fmla z21.h, z13.h, z5.h[7]\n"
+                        "fmla z25.h, z13.h, z6.h[7]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "fmla z22.h, z14.h, z5.h[7]\n"
+                        "fmla z26.h, z14.h, z6.h[7]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "fmla z23.h, z15.h, z5.h[7]\n"
+                        "fmla z27.h, z15.h, z6.h[7]\n"
+                        "cbz %[blocks], 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z6.h, p6/z, [a_ptr2]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "fmla z24.h, z12.h, z2.h[7]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "fmla z25.h, z13.h, z2.h[7]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "fmla z26.h, z14.h, z2.h[7]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "fmla z27.h, z15.h, z2.h[7]\n"
+                        "cbz %[blocks], 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "fmla z24.h, z8.h, z6.h[0]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "fmla z25.h, z9.h, z6.h[0]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "fmla z26.h, z10.h, z6.h[0]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "fmla z27.h, z11.h, z6.h[0]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "fmla z24.h, z12.h, z6.h[1]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "fmla z25.h, z13.h, z6.h[1]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "fmla z26.h, z14.h, z6.h[1]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "fmla z27.h, z15.h, z6.h[1]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z24.h, z8.h, z6.h[2]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "fmla z25.h, z9.h, z6.h[2]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "fmla z26.h, z10.h, z6.h[2]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "fmla z27.h, z11.h, z6.h[2]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "fmla z24.h, z12.h, z6.h[3]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "fmla z25.h, z13.h, z6.h[3]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "fmla z26.h, z14.h, z6.h[3]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "fmla z27.h, z15.h, z6.h[3]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "fmla z24.h, z8.h, z6.h[4]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "fmla z25.h, z9.h, z6.h[4]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "fmla z26.h, z10.h, z6.h[4]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "fmla z27.h, z11.h, z6.h[4]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "fmla z24.h, z12.h, z6.h[5]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "fmla z25.h, z13.h, z6.h[5]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "fmla z26.h, z14.h, z6.h[5]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "fmla z27.h, z15.h, z6.h[5]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z24.h, z8.h, z6.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z25.h, z9.h, z6.h[6]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z26.h, z10.h, z6.h[6]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "fmla z27.h, z11.h, z6.h[6]\n"
+                        "5:\n"
+                        "st1h z16.h, p0, [%[c_ptr0]]\n"
+                        "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1h z20.h, p0, [c_ptr1]\n"
+                        "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
+                        "st1h z24.h, p0, [c_ptr2]\n"
+                        "st1h z25.h, p1, [c_ptr2, #1, MUL VL]\n"
+                        "st1h z26.h, p2, [c_ptr2, #2, MUL VL]\n"
+                        "st1h z27.h, p3, [c_ptr2, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+                    );
+                    break;
+                default:
+                case 4:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "a_ptr3 .req X2\n"
+                        "c_ptr1 .req X3\n"
+                        "c_ptr2 .req X4\n"
+                        "c_ptr3 .req X5\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "add a_ptr3, a_ptr2, %[lda]\n"
+                        "add c_ptr3, c_ptr2, %[ldc]\n"
+                        "whilelt p6.h, %[temp], %[leftovers]\n"
+                        "whilelt p0.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "ptrue p7.h\n"
+                        "whilelt p1.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p2.h, %[temp], %[width]\n"
+                        "inch %[temp], all, mul #1\n"
+                        "whilelt p3.h, %[temp], %[width]\n"
+                        "cbz %[beta0], 1f\n"
+                        "mov z16.h, #0\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "mov z17.h, #0\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+                        "mov z18.h, #0\n"
+                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+                        "mov z19.h, #0\n"
+                        "ld1rqh z3.h, p7/z, [a_ptr3]\n"
+                        "mov z20.h, #0\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "mov z21.h, #0\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z22.h, #0\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "mov z23.h, #0\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z24.h, #0\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "mov z25.h, #0\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "mov z26.h, #0\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "mov z27.h, #0\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "mov z28.h, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "mov z29.h, #0\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "mov z30.h, #0\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z31.h, #0\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1rh z15.h, p7/z, [%[betaptr]]\n"
+                        "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
+                        "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
+                        "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
+                        "fmul z16.h, p7/m, z16.h, z15.h\n"
+                        "ld1h z20.h, p0/z, [c_ptr1]\n"
+                        "fmul z17.h, p7/m, z17.h, z15.h\n"
+                        "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "fmul z18.h, p7/m, z18.h, z15.h\n"
+                        "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "fmul z19.h, p7/m, z19.h, z15.h\n"
+                        "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "fmul z20.h, p7/m, z20.h, z15.h\n"
+                        "ld1h z24.h, p0/z, [c_ptr2]\n"
+                        "fmul z21.h, p7/m, z21.h, z15.h\n"
+                        "ld1h z25.h, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "fmul z22.h, p7/m, z22.h, z15.h\n"
+                        "ld1h z26.h, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "fmul z23.h, p7/m, z23.h, z15.h\n"
+                        "ld1h z27.h, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "fmul z24.h, p7/m, z24.h, z15.h\n"
+                        "ld1h z28.h, p0/z, [c_ptr3]\n"
+                        "fmul z25.h, p7/m, z25.h, z15.h\n"
+                        "ld1h z29.h, p1/z, [c_ptr3, #1, MUL VL]\n"
+                        "fmul z26.h, p7/m, z26.h, z15.h\n"
+                        "ld1h z30.h, p2/z, [c_ptr3, #2, MUL VL]\n"
+                        "fmul z27.h, p7/m, z27.h, z15.h\n"
+                        "ld1h z31.h, p3/z, [c_ptr3, #3, MUL VL]\n"
+                        "fmul z28.h, p7/m, z28.h, z15.h\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
+                        "fmul z29.h, p7/m, z29.h, z15.h\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
+                        "fmul z30.h, p7/m, z30.h, z15.h\n"
+                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
+                        "fmul z31.h, p7/m, z31.h, z15.h\n"
+                        "ld1rqh z3.h, p7/z, [a_ptr3]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+                        "fmla z28.h, z8.h, z3.h[0]\n"
+                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z7.h, p7/z, [a_ptr3]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z29.h, z9.h, z3.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla z30.h, z10.h, z3.h[0]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "add a_ptr3, a_ptr3, #0x20\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "fmla z31.h, z11.h, z3.h[0]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "fmla z28.h, z12.h, z3.h[1]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "fmla z29.h, z13.h, z3.h[1]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "fmla z30.h, z14.h, z3.h[1]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "fmla z31.h, z15.h, z3.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "fmla z28.h, z8.h, z3.h[2]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "fmla z29.h, z9.h, z3.h[2]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "fmla z30.h, z10.h, z3.h[2]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "fmla z31.h, z11.h, z3.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "fmla z28.h, z12.h, z3.h[3]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "fmla z29.h, z13.h, z3.h[3]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "fmla z30.h, z14.h, z3.h[3]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "fmla z31.h, z15.h, z3.h[3]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "fmla z28.h, z8.h, z3.h[4]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "fmla z29.h, z9.h, z3.h[4]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "fmla z30.h, z10.h, z3.h[4]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "fmla z31.h, z11.h, z3.h[4]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "fmla z28.h, z12.h, z3.h[5]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "fmla z29.h, z13.h, z3.h[5]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "fmla z30.h, z14.h, z3.h[5]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "fmla z31.h, z15.h, z3.h[5]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "fmla z28.h, z8.h, z3.h[6]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "fmla z29.h, z9.h, z3.h[6]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "fmla z30.h, z10.h, z3.h[6]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "fmla z31.h, z11.h, z3.h[6]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "fmla z24.h, z12.h, z2.h[7]\n"
+                        "fmla z28.h, z12.h, z3.h[7]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "fmla z25.h, z13.h, z2.h[7]\n"
+                        "fmla z29.h, z13.h, z3.h[7]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "fmla z26.h, z14.h, z2.h[7]\n"
+                        "fmla z30.h, z14.h, z3.h[7]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
+                        "fmla z27.h, z15.h, z2.h[7]\n"
+                        "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
+                        "fmla z31.h, z15.h, z3.h[7]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z24.h, z8.h, z6.h[0]\n"
+                        "fmla z28.h, z8.h, z7.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "fmla z25.h, z9.h, z6.h[0]\n"
+                        "fmla z29.h, z9.h, z7.h[0]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "fmla z26.h, z10.h, z6.h[0]\n"
+                        "fmla z30.h, z10.h, z7.h[0]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "fmla z27.h, z11.h, z6.h[0]\n"
+                        "fmla z31.h, z11.h, z7.h[0]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "fmla z24.h, z12.h, z6.h[1]\n"
+                        "fmla z28.h, z12.h, z7.h[1]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "fmla z25.h, z13.h, z6.h[1]\n"
+                        "fmla z29.h, z13.h, z7.h[1]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "fmla z26.h, z14.h, z6.h[1]\n"
+                        "fmla z30.h, z14.h, z7.h[1]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "fmla z27.h, z15.h, z6.h[1]\n"
+                        "fmla z31.h, z15.h, z7.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z24.h, z8.h, z6.h[2]\n"
+                        "fmla z28.h, z8.h, z7.h[2]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "fmla z25.h, z9.h, z6.h[2]\n"
+                        "fmla z29.h, z9.h, z7.h[2]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "fmla z26.h, z10.h, z6.h[2]\n"
+                        "fmla z30.h, z10.h, z7.h[2]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "fmla z27.h, z11.h, z6.h[2]\n"
+                        "fmla z31.h, z11.h, z7.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "fmla z24.h, z12.h, z6.h[3]\n"
+                        "fmla z28.h, z12.h, z7.h[3]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "fmla z25.h, z13.h, z6.h[3]\n"
+                        "fmla z29.h, z13.h, z7.h[3]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "fmla z26.h, z14.h, z6.h[3]\n"
+                        "fmla z30.h, z14.h, z7.h[3]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "fmla z27.h, z15.h, z6.h[3]\n"
+                        "fmla z31.h, z15.h, z7.h[3]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "fmla z24.h, z8.h, z6.h[4]\n"
+                        "fmla z28.h, z8.h, z7.h[4]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "fmla z25.h, z9.h, z6.h[4]\n"
+                        "fmla z29.h, z9.h, z7.h[4]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "fmla z26.h, z10.h, z6.h[4]\n"
+                        "fmla z30.h, z10.h, z7.h[4]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "fmla z27.h, z11.h, z6.h[4]\n"
+                        "fmla z31.h, z11.h, z7.h[4]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "fmla z24.h, z12.h, z6.h[5]\n"
+                        "fmla z28.h, z12.h, z7.h[5]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "fmla z25.h, z13.h, z6.h[5]\n"
+                        "fmla z29.h, z13.h, z7.h[5]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "fmla z26.h, z14.h, z6.h[5]\n"
+                        "fmla z30.h, z14.h, z7.h[5]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "fmla z27.h, z15.h, z6.h[5]\n"
+                        "fmla z31.h, z15.h, z7.h[5]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z24.h, z8.h, z6.h[6]\n"
+                        "fmla z28.h, z8.h, z7.h[6]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z25.h, z9.h, z6.h[6]\n"
+                        "fmla z29.h, z9.h, z7.h[6]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z26.h, z10.h, z6.h[6]\n"
+                        "fmla z30.h, z10.h, z7.h[6]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "fmla z27.h, z11.h, z6.h[6]\n"
+                        "fmla z31.h, z11.h, z7.h[6]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[7]\n"
+                        "fmla z24.h, z12.h, z6.h[7]\n"
+                        "fmla z28.h, z12.h, z7.h[7]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "fmla z21.h, z13.h, z5.h[7]\n"
+                        "fmla z25.h, z13.h, z6.h[7]\n"
+                        "fmla z29.h, z13.h, z7.h[7]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "fmla z22.h, z14.h, z5.h[7]\n"
+                        "fmla z26.h, z14.h, z6.h[7]\n"
+                        "fmla z30.h, z14.h, z7.h[7]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "fmla z23.h, z15.h, z5.h[7]\n"
+                        "fmla z27.h, z15.h, z6.h[7]\n"
+                        "fmla z31.h, z15.h, z7.h[7]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
+                        "fmla z28.h, z8.h, z3.h[0]\n"
+                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z7.h, p7/z, [a_ptr3]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z29.h, z9.h, z3.h[0]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "fmla z30.h, z10.h, z3.h[0]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "fmla z31.h, z11.h, z3.h[0]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "fmla z28.h, z12.h, z3.h[1]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "fmla z29.h, z13.h, z3.h[1]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "fmla z30.h, z14.h, z3.h[1]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "fmla z31.h, z15.h, z3.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "fmla z28.h, z8.h, z3.h[2]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "fmla z29.h, z9.h, z3.h[2]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "fmla z30.h, z10.h, z3.h[2]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "fmla z31.h, z11.h, z3.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "fmla z28.h, z12.h, z3.h[3]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "fmla z29.h, z13.h, z3.h[3]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "fmla z30.h, z14.h, z3.h[3]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "fmla z31.h, z15.h, z3.h[3]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "fmla z28.h, z8.h, z3.h[4]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "fmla z29.h, z9.h, z3.h[4]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "fmla z30.h, z10.h, z3.h[4]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "fmla z31.h, z11.h, z3.h[4]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "fmla z28.h, z12.h, z3.h[5]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "fmla z29.h, z13.h, z3.h[5]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "fmla z30.h, z14.h, z3.h[5]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "fmla z31.h, z15.h, z3.h[5]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "fmla z28.h, z8.h, z3.h[6]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "fmla z29.h, z9.h, z3.h[6]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "fmla z30.h, z10.h, z3.h[6]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "fmla z31.h, z11.h, z3.h[6]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "fmla z24.h, z12.h, z2.h[7]\n"
+                        "fmla z28.h, z12.h, z3.h[7]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "fmla z25.h, z13.h, z2.h[7]\n"
+                        "fmla z29.h, z13.h, z3.h[7]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "fmla z26.h, z14.h, z2.h[7]\n"
+                        "fmla z30.h, z14.h, z3.h[7]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
+                        "fmla z27.h, z15.h, z2.h[7]\n"
+                        "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
+                        "fmla z31.h, z15.h, z3.h[7]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z24.h, z8.h, z6.h[0]\n"
+                        "fmla z28.h, z8.h, z7.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "fmla z25.h, z9.h, z6.h[0]\n"
+                        "fmla z29.h, z9.h, z7.h[0]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "fmla z26.h, z10.h, z6.h[0]\n"
+                        "fmla z30.h, z10.h, z7.h[0]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "fmla z27.h, z11.h, z6.h[0]\n"
+                        "fmla z31.h, z11.h, z7.h[0]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "fmla z24.h, z12.h, z6.h[1]\n"
+                        "fmla z28.h, z12.h, z7.h[1]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "fmla z25.h, z13.h, z6.h[1]\n"
+                        "fmla z29.h, z13.h, z7.h[1]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "fmla z26.h, z14.h, z6.h[1]\n"
+                        "fmla z30.h, z14.h, z7.h[1]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "fmla z27.h, z15.h, z6.h[1]\n"
+                        "fmla z31.h, z15.h, z7.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z24.h, z8.h, z6.h[2]\n"
+                        "fmla z28.h, z8.h, z7.h[2]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "fmla z25.h, z9.h, z6.h[2]\n"
+                        "fmla z29.h, z9.h, z7.h[2]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "fmla z26.h, z10.h, z6.h[2]\n"
+                        "fmla z30.h, z10.h, z7.h[2]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "fmla z27.h, z11.h, z6.h[2]\n"
+                        "fmla z31.h, z11.h, z7.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "fmla z24.h, z12.h, z6.h[3]\n"
+                        "fmla z28.h, z12.h, z7.h[3]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "fmla z25.h, z13.h, z6.h[3]\n"
+                        "fmla z29.h, z13.h, z7.h[3]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "fmla z26.h, z14.h, z6.h[3]\n"
+                        "fmla z30.h, z14.h, z7.h[3]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "fmla z27.h, z15.h, z6.h[3]\n"
+                        "fmla z31.h, z15.h, z7.h[3]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "fmla z24.h, z8.h, z6.h[4]\n"
+                        "fmla z28.h, z8.h, z7.h[4]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "fmla z25.h, z9.h, z6.h[4]\n"
+                        "fmla z29.h, z9.h, z7.h[4]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "fmla z26.h, z10.h, z6.h[4]\n"
+                        "fmla z30.h, z10.h, z7.h[4]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "fmla z27.h, z11.h, z6.h[4]\n"
+                        "fmla z31.h, z11.h, z7.h[4]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "fmla z24.h, z12.h, z6.h[5]\n"
+                        "fmla z28.h, z12.h, z7.h[5]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "fmla z25.h, z13.h, z6.h[5]\n"
+                        "fmla z29.h, z13.h, z7.h[5]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "fmla z26.h, z14.h, z6.h[5]\n"
+                        "fmla z30.h, z14.h, z7.h[5]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "fmla z27.h, z15.h, z6.h[5]\n"
+                        "fmla z31.h, z15.h, z7.h[5]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z24.h, z8.h, z6.h[6]\n"
+                        "fmla z28.h, z8.h, z7.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z25.h, z9.h, z6.h[6]\n"
+                        "fmla z29.h, z9.h, z7.h[6]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z26.h, z10.h, z6.h[6]\n"
+                        "fmla z30.h, z10.h, z7.h[6]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "fmla z27.h, z11.h, z6.h[6]\n"
+                        "fmla z31.h, z11.h, z7.h[6]\n"
+                        "fmla z16.h, z12.h, z4.h[7]\n"
+                        "fmla z20.h, z12.h, z5.h[7]\n"
+                        "fmla z24.h, z12.h, z6.h[7]\n"
+                        "fmla z28.h, z12.h, z7.h[7]\n"
+                        "fmla z17.h, z13.h, z4.h[7]\n"
+                        "fmla z21.h, z13.h, z5.h[7]\n"
+                        "fmla z25.h, z13.h, z6.h[7]\n"
+                        "fmla z29.h, z13.h, z7.h[7]\n"
+                        "fmla z18.h, z14.h, z4.h[7]\n"
+                        "fmla z22.h, z14.h, z5.h[7]\n"
+                        "fmla z26.h, z14.h, z6.h[7]\n"
+                        "fmla z30.h, z14.h, z7.h[7]\n"
+                        "fmla z19.h, z15.h, z4.h[7]\n"
+                        "fmla z23.h, z15.h, z5.h[7]\n"
+                        "fmla z27.h, z15.h, z6.h[7]\n"
+                        "fmla z31.h, z15.h, z7.h[7]\n"
+                        "cbz %[blocks], 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "fmla z28.h, z8.h, z3.h[0]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "fmla z29.h, z9.h, z3.h[0]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "fmla z30.h, z10.h, z3.h[0]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "fmla z31.h, z11.h, z3.h[0]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "fmla z28.h, z12.h, z3.h[1]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "fmla z29.h, z13.h, z3.h[1]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "fmla z30.h, z14.h, z3.h[1]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "fmla z31.h, z15.h, z3.h[1]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "fmla z28.h, z8.h, z3.h[2]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "fmla z29.h, z9.h, z3.h[2]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "fmla z30.h, z10.h, z3.h[2]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "fmla z31.h, z11.h, z3.h[2]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "fmla z28.h, z12.h, z3.h[3]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "fmla z29.h, z13.h, z3.h[3]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "fmla z30.h, z14.h, z3.h[3]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "fmla z31.h, z15.h, z3.h[3]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "fmla z28.h, z8.h, z3.h[4]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "fmla z29.h, z9.h, z3.h[4]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "fmla z30.h, z10.h, z3.h[4]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "fmla z31.h, z11.h, z3.h[4]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "fmla z28.h, z12.h, z3.h[5]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "fmla z29.h, z13.h, z3.h[5]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "fmla z30.h, z14.h, z3.h[5]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "fmla z31.h, z15.h, z3.h[5]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "fmla z28.h, z8.h, z3.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "fmla z29.h, z9.h, z3.h[6]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "fmla z30.h, z10.h, z3.h[6]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "fmla z31.h, z11.h, z3.h[6]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla z16.h, z8.h, z0.h[0]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z20.h, z8.h, z1.h[0]\n"
+                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
+                        "fmla z24.h, z8.h, z2.h[0]\n"
+                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
+                        "fmla z28.h, z8.h, z3.h[0]\n"
+                        "ld1rqh z6.h, p6/z, [a_ptr2]\n"
+                        "fmla z17.h, z9.h, z0.h[0]\n"
+                        "ld1rqh z7.h, p6/z, [a_ptr3]\n"
+                        "fmla z21.h, z9.h, z1.h[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z25.h, z9.h, z2.h[0]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z29.h, z9.h, z3.h[0]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[0]\n"
+                        "fmla z22.h, z10.h, z1.h[0]\n"
+                        "fmla z26.h, z10.h, z2.h[0]\n"
+                        "fmla z30.h, z10.h, z3.h[0]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[0]\n"
+                        "fmla z23.h, z11.h, z1.h[0]\n"
+                        "fmla z27.h, z11.h, z2.h[0]\n"
+                        "fmla z31.h, z11.h, z3.h[0]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[1]\n"
+                        "fmla z24.h, z12.h, z2.h[1]\n"
+                        "fmla z28.h, z12.h, z3.h[1]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[1]\n"
+                        "fmla z21.h, z13.h, z1.h[1]\n"
+                        "fmla z25.h, z13.h, z2.h[1]\n"
+                        "fmla z29.h, z13.h, z3.h[1]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[1]\n"
+                        "fmla z22.h, z14.h, z1.h[1]\n"
+                        "fmla z26.h, z14.h, z2.h[1]\n"
+                        "fmla z30.h, z14.h, z3.h[1]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[1]\n"
+                        "fmla z23.h, z15.h, z1.h[1]\n"
+                        "fmla z27.h, z15.h, z2.h[1]\n"
+                        "fmla z31.h, z15.h, z3.h[1]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[2]\n"
+                        "fmla z24.h, z8.h, z2.h[2]\n"
+                        "fmla z28.h, z8.h, z3.h[2]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[2]\n"
+                        "fmla z21.h, z9.h, z1.h[2]\n"
+                        "fmla z25.h, z9.h, z2.h[2]\n"
+                        "fmla z29.h, z9.h, z3.h[2]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[2]\n"
+                        "fmla z22.h, z10.h, z1.h[2]\n"
+                        "fmla z26.h, z10.h, z2.h[2]\n"
+                        "fmla z30.h, z10.h, z3.h[2]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[2]\n"
+                        "fmla z23.h, z11.h, z1.h[2]\n"
+                        "fmla z27.h, z11.h, z2.h[2]\n"
+                        "fmla z31.h, z11.h, z3.h[2]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[3]\n"
+                        "fmla z24.h, z12.h, z2.h[3]\n"
+                        "fmla z28.h, z12.h, z3.h[3]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[3]\n"
+                        "fmla z21.h, z13.h, z1.h[3]\n"
+                        "fmla z25.h, z13.h, z2.h[3]\n"
+                        "fmla z29.h, z13.h, z3.h[3]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[3]\n"
+                        "fmla z22.h, z14.h, z1.h[3]\n"
+                        "fmla z26.h, z14.h, z2.h[3]\n"
+                        "fmla z30.h, z14.h, z3.h[3]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[3]\n"
+                        "fmla z23.h, z15.h, z1.h[3]\n"
+                        "fmla z27.h, z15.h, z2.h[3]\n"
+                        "fmla z31.h, z15.h, z3.h[3]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[4]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z8.h, z1.h[4]\n"
+                        "fmla z24.h, z8.h, z2.h[4]\n"
+                        "fmla z28.h, z8.h, z3.h[4]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z9.h, z0.h[4]\n"
+                        "fmla z21.h, z9.h, z1.h[4]\n"
+                        "fmla z25.h, z9.h, z2.h[4]\n"
+                        "fmla z29.h, z9.h, z3.h[4]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z10.h, z0.h[4]\n"
+                        "fmla z22.h, z10.h, z1.h[4]\n"
+                        "fmla z26.h, z10.h, z2.h[4]\n"
+                        "fmla z30.h, z10.h, z3.h[4]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z11.h, z0.h[4]\n"
+                        "fmla z23.h, z11.h, z1.h[4]\n"
+                        "fmla z27.h, z11.h, z2.h[4]\n"
+                        "fmla z31.h, z11.h, z3.h[4]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z0.h[5]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "fmla z20.h, z12.h, z1.h[5]\n"
+                        "fmla z24.h, z12.h, z2.h[5]\n"
+                        "fmla z28.h, z12.h, z3.h[5]\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "fmla z17.h, z13.h, z0.h[5]\n"
+                        "fmla z21.h, z13.h, z1.h[5]\n"
+                        "fmla z25.h, z13.h, z2.h[5]\n"
+                        "fmla z29.h, z13.h, z3.h[5]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "fmla z18.h, z14.h, z0.h[5]\n"
+                        "fmla z22.h, z14.h, z1.h[5]\n"
+                        "fmla z26.h, z14.h, z2.h[5]\n"
+                        "fmla z30.h, z14.h, z3.h[5]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "fmla z19.h, z15.h, z0.h[5]\n"
+                        "fmla z23.h, z15.h, z1.h[5]\n"
+                        "fmla z27.h, z15.h, z2.h[5]\n"
+                        "fmla z31.h, z15.h, z3.h[5]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z0.h[6]\n"
+                        "fmla z20.h, z8.h, z1.h[6]\n"
+                        "fmla z24.h, z8.h, z2.h[6]\n"
+                        "fmla z28.h, z8.h, z3.h[6]\n"
+                        "fmla z17.h, z9.h, z0.h[6]\n"
+                        "fmla z21.h, z9.h, z1.h[6]\n"
+                        "fmla z25.h, z9.h, z2.h[6]\n"
+                        "fmla z29.h, z9.h, z3.h[6]\n"
+                        "fmla z18.h, z10.h, z0.h[6]\n"
+                        "fmla z22.h, z10.h, z1.h[6]\n"
+                        "fmla z26.h, z10.h, z2.h[6]\n"
+                        "fmla z30.h, z10.h, z3.h[6]\n"
+                        "fmla z19.h, z11.h, z0.h[6]\n"
+                        "fmla z23.h, z11.h, z1.h[6]\n"
+                        "fmla z27.h, z11.h, z2.h[6]\n"
+                        "fmla z31.h, z11.h, z3.h[6]\n"
+                        "fmla z16.h, z12.h, z0.h[7]\n"
+                        "fmla z20.h, z12.h, z1.h[7]\n"
+                        "fmla z24.h, z12.h, z2.h[7]\n"
+                        "fmla z28.h, z12.h, z3.h[7]\n"
+                        "fmla z17.h, z13.h, z0.h[7]\n"
+                        "fmla z21.h, z13.h, z1.h[7]\n"
+                        "fmla z25.h, z13.h, z2.h[7]\n"
+                        "fmla z29.h, z13.h, z3.h[7]\n"
+                        "fmla z18.h, z14.h, z0.h[7]\n"
+                        "fmla z22.h, z14.h, z1.h[7]\n"
+                        "fmla z26.h, z14.h, z2.h[7]\n"
+                        "fmla z30.h, z14.h, z3.h[7]\n"
+                        "fmla z19.h, z15.h, z0.h[7]\n"
+                        "fmla z23.h, z15.h, z1.h[7]\n"
+                        "fmla z27.h, z15.h, z2.h[7]\n"
+                        "fmla z31.h, z15.h, z3.h[7]\n"
+                        "cbz %[blocks], 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[0]\n"
+                        "fmla z20.h, z8.h, z5.h[0]\n"
+                        "fmla z24.h, z8.h, z6.h[0]\n"
+                        "fmla z28.h, z8.h, z7.h[0]\n"
+                        "fmla z17.h, z9.h, z4.h[0]\n"
+                        "fmla z21.h, z9.h, z5.h[0]\n"
+                        "fmla z25.h, z9.h, z6.h[0]\n"
+                        "fmla z29.h, z9.h, z7.h[0]\n"
+                        "fmla z18.h, z10.h, z4.h[0]\n"
+                        "fmla z22.h, z10.h, z5.h[0]\n"
+                        "fmla z26.h, z10.h, z6.h[0]\n"
+                        "fmla z30.h, z10.h, z7.h[0]\n"
+                        "fmla z19.h, z11.h, z4.h[0]\n"
+                        "fmla z23.h, z11.h, z5.h[0]\n"
+                        "fmla z27.h, z11.h, z6.h[0]\n"
+                        "fmla z31.h, z11.h, z7.h[0]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[1]\n"
+                        "fmla z20.h, z12.h, z5.h[1]\n"
+                        "fmla z24.h, z12.h, z6.h[1]\n"
+                        "fmla z28.h, z12.h, z7.h[1]\n"
+                        "fmla z17.h, z13.h, z4.h[1]\n"
+                        "fmla z21.h, z13.h, z5.h[1]\n"
+                        "fmla z25.h, z13.h, z6.h[1]\n"
+                        "fmla z29.h, z13.h, z7.h[1]\n"
+                        "fmla z18.h, z14.h, z4.h[1]\n"
+                        "fmla z22.h, z14.h, z5.h[1]\n"
+                        "fmla z26.h, z14.h, z6.h[1]\n"
+                        "fmla z30.h, z14.h, z7.h[1]\n"
+                        "fmla z19.h, z15.h, z4.h[1]\n"
+                        "fmla z23.h, z15.h, z5.h[1]\n"
+                        "fmla z27.h, z15.h, z6.h[1]\n"
+                        "fmla z31.h, z15.h, z7.h[1]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[2]\n"
+                        "fmla z20.h, z8.h, z5.h[2]\n"
+                        "fmla z24.h, z8.h, z6.h[2]\n"
+                        "fmla z28.h, z8.h, z7.h[2]\n"
+                        "fmla z17.h, z9.h, z4.h[2]\n"
+                        "fmla z21.h, z9.h, z5.h[2]\n"
+                        "fmla z25.h, z9.h, z6.h[2]\n"
+                        "fmla z29.h, z9.h, z7.h[2]\n"
+                        "fmla z18.h, z10.h, z4.h[2]\n"
+                        "fmla z22.h, z10.h, z5.h[2]\n"
+                        "fmla z26.h, z10.h, z6.h[2]\n"
+                        "fmla z30.h, z10.h, z7.h[2]\n"
+                        "fmla z19.h, z11.h, z4.h[2]\n"
+                        "fmla z23.h, z11.h, z5.h[2]\n"
+                        "fmla z27.h, z11.h, z6.h[2]\n"
+                        "fmla z31.h, z11.h, z7.h[2]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[3]\n"
+                        "fmla z20.h, z12.h, z5.h[3]\n"
+                        "fmla z24.h, z12.h, z6.h[3]\n"
+                        "fmla z28.h, z12.h, z7.h[3]\n"
+                        "fmla z17.h, z13.h, z4.h[3]\n"
+                        "fmla z21.h, z13.h, z5.h[3]\n"
+                        "fmla z25.h, z13.h, z6.h[3]\n"
+                        "fmla z29.h, z13.h, z7.h[3]\n"
+                        "fmla z18.h, z14.h, z4.h[3]\n"
+                        "fmla z22.h, z14.h, z5.h[3]\n"
+                        "fmla z26.h, z14.h, z6.h[3]\n"
+                        "fmla z30.h, z14.h, z7.h[3]\n"
+                        "fmla z19.h, z15.h, z4.h[3]\n"
+                        "fmla z23.h, z15.h, z5.h[3]\n"
+                        "fmla z27.h, z15.h, z6.h[3]\n"
+                        "fmla z31.h, z15.h, z7.h[3]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[4]\n"
+                        "fmla z20.h, z8.h, z5.h[4]\n"
+                        "fmla z24.h, z8.h, z6.h[4]\n"
+                        "fmla z28.h, z8.h, z7.h[4]\n"
+                        "fmla z17.h, z9.h, z4.h[4]\n"
+                        "fmla z21.h, z9.h, z5.h[4]\n"
+                        "fmla z25.h, z9.h, z6.h[4]\n"
+                        "fmla z29.h, z9.h, z7.h[4]\n"
+                        "fmla z18.h, z10.h, z4.h[4]\n"
+                        "fmla z22.h, z10.h, z5.h[4]\n"
+                        "fmla z26.h, z10.h, z6.h[4]\n"
+                        "fmla z30.h, z10.h, z7.h[4]\n"
+                        "fmla z19.h, z11.h, z4.h[4]\n"
+                        "fmla z23.h, z11.h, z5.h[4]\n"
+                        "fmla z27.h, z11.h, z6.h[4]\n"
+                        "fmla z31.h, z11.h, z7.h[4]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z12.h, z4.h[5]\n"
+                        "fmla z20.h, z12.h, z5.h[5]\n"
+                        "fmla z24.h, z12.h, z6.h[5]\n"
+                        "fmla z28.h, z12.h, z7.h[5]\n"
+                        "fmla z17.h, z13.h, z4.h[5]\n"
+                        "fmla z21.h, z13.h, z5.h[5]\n"
+                        "fmla z25.h, z13.h, z6.h[5]\n"
+                        "fmla z29.h, z13.h, z7.h[5]\n"
+                        "fmla z18.h, z14.h, z4.h[5]\n"
+                        "fmla z22.h, z14.h, z5.h[5]\n"
+                        "fmla z26.h, z14.h, z6.h[5]\n"
+                        "fmla z30.h, z14.h, z7.h[5]\n"
+                        "fmla z19.h, z15.h, z4.h[5]\n"
+                        "fmla z23.h, z15.h, z5.h[5]\n"
+                        "fmla z27.h, z15.h, z6.h[5]\n"
+                        "fmla z31.h, z15.h, z7.h[5]\n"
+                        "b.eq 5f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
+                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "fmla z16.h, z8.h, z4.h[6]\n"
+                        "fmla z20.h, z8.h, z5.h[6]\n"
+                        "fmla z24.h, z8.h, z6.h[6]\n"
+                        "fmla z28.h, z8.h, z7.h[6]\n"
+                        "fmla z17.h, z9.h, z4.h[6]\n"
+                        "fmla z21.h, z9.h, z5.h[6]\n"
+                        "fmla z25.h, z9.h, z6.h[6]\n"
+                        "fmla z29.h, z9.h, z7.h[6]\n"
+                        "fmla z18.h, z10.h, z4.h[6]\n"
+                        "fmla z22.h, z10.h, z5.h[6]\n"
+                        "fmla z26.h, z10.h, z6.h[6]\n"
+                        "fmla z30.h, z10.h, z7.h[6]\n"
+                        "fmla z19.h, z11.h, z4.h[6]\n"
+                        "fmla z23.h, z11.h, z5.h[6]\n"
+                        "fmla z27.h, z11.h, z6.h[6]\n"
+                        "fmla z31.h, z11.h, z7.h[6]\n"
+                        "5:\n"
+                        "st1h z16.h, p0, [%[c_ptr0]]\n"
+                        "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
+                        "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
+                        "st1h z20.h, p0, [c_ptr1]\n"
+                        "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
+                        "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
+                        "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
+                        "st1h z24.h, p0, [c_ptr2]\n"
+                        "st1h z25.h, p1, [c_ptr2, #1, MUL VL]\n"
+                        "st1h z26.h, p2, [c_ptr2, #2, MUL VL]\n"
+                        "st1h z27.h, p3, [c_ptr2, #3, MUL VL]\n"
+                        "st1h z28.h, p0, [c_ptr3]\n"
+                        "st1h z29.h, p1, [c_ptr3, #1, MUL VL]\n"
+                        "st1h z30.h, p2, [c_ptr3, #2, MUL VL]\n"
+                        "st1h z31.h, p3, [c_ptr3, #3, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq a_ptr3\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        ".unreq c_ptr3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+                    );
+                    break;
+            }
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp
index 9c02d95..abee1bb 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -86,63 +86,73 @@
                         "mov z19.s, #0\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "b 2f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
                         "1:\n"
                         "ld1rw z15.s, p7/z, [%[betaptr]]\n"
                         "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
                         "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
                         "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
                         "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                         "mul z16.s, p7/m, z16.s, z15.s\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                         "mul z17.s, p7/m, z17.s, z15.s\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "mul z18.s, p7/m, z18.s, z15.s\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "mul z19.s, p7/m, z19.s, z15.s\n"
-                        "2:\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "cbz %[loops], 3f\n"
-                        "4:\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
                         "zip1 z10.b, z10.b, z8.b\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
                         "sdot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                         "zip2 z13.b, z13.b, z14.b\n"
-                        "subs %[loops], %[loops], #0x1\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
@@ -150,137 +160,137 @@
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z17.s, z13.b, z0.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z15.b, z0.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z16.s, z8.b, z0.b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z17.s, z9.b, z0.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z11.b, z0.b[2]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z16.s, z12.b, z0.b[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z17.s, z13.b, z0.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z15.b, z0.b[3]\n"
                         "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z16.s, z8.b, z4.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z17.s, z9.b, z4.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "sdot z18.s, z10.b, z4.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "sdot z19.s, z11.b, z4.b[0]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z16.s, z12.b, z4.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z17.s, z13.b, z4.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z15.b, z4.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z16.s, z8.b, z4.b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z17.s, z9.b, z4.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z11.b, z4.b[2]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z16.s, z12.b, z4.b[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z17.s, z13.b, z4.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z4.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z15.b, z4.b[3]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "b.ne 4b\n"
-                        "3:\n"
                         "zip2 z12.b, z10.b, z8.b\n"
+                        "b.ne 3b\n"
+                        "2:\n"
                         "zip1 z10.b, z10.b, z8.b\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
@@ -289,118 +299,118 @@
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
-                        "cbz %[regs], 5f\n"
+                        "cbz %[regs], 4f\n"
                         "sdot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "sdot z17.s, z9.b, z0.b[0]\n"
                         "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
                         "sdot z18.s, z10.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z14.b, z15.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z16.s, z12.b, z0.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z17.s, z13.b, z0.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z15.b, z0.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z16.s, z8.b, z0.b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z17.s, z9.b, z0.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z11.b, z0.b[2]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z16.s, z12.b, z0.b[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z17.s, z13.b, z0.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z15.b, z0.b[3]\n"
                         "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z16.s, z8.b, z4.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z17.s, z9.b, z4.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z11.b, z4.b[0]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z16.s, z12.b, z4.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z17.s, z13.b, z4.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z15.b, z4.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
@@ -427,15 +437,15 @@
                         "sdot z17.s, z13.b, z4.b[3]\n"
                         "sdot z18.s, z14.b, z4.b[3]\n"
                         "sdot z19.s, z15.b, z4.b[3]\n"
-                        "cbz %[blocks], 6f\n"
+                        "cbz %[blocks], 5f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -449,15 +459,15 @@
                         "sdot z17.s, z9.b, z0.b[0]\n"
                         "sdot z18.s, z10.b, z0.b[0]\n"
                         "sdot z19.s, z11.b, z0.b[0]\n"
-                        "b.eq 7f\n"
+                        "b.eq 6f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -471,13 +481,13 @@
                         "sdot z17.s, z13.b, z0.b[1]\n"
                         "sdot z18.s, z14.b, z0.b[1]\n"
                         "sdot z19.s, z15.b, z0.b[1]\n"
-                        "b.eq 8f\n"
+                        "b.eq 7f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -492,31 +502,31 @@
                         "sdot z17.s, z9.b, z0.b[2]\n"
                         "sdot z18.s, z10.b, z0.b[2]\n"
                         "sdot z19.s, z11.b, z0.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 9f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 10f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 11f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "11:\n"
+                        "b 11f\n"
+                        "10:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "10:\n"
+                        "b 11f\n"
+                        "9:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "12:\n"
+                        "11:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -530,33 +540,33 @@
                         "sdot z17.s, z13.b, z0.b[3]\n"
                         "sdot z18.s, z14.b, z0.b[3]\n"
                         "sdot z19.s, z15.b, z0.b[3]\n"
-                        "b 9f\n"
-                        "8:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "7:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 12f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 13f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 14f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "14:\n"
+                        "b 14f\n"
+                        "13:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "13:\n"
+                        "b 14f\n"
+                        "12:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "15:\n"
+                        "14:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -570,33 +580,33 @@
                         "sdot z17.s, z9.b, z0.b[2]\n"
                         "sdot z18.s, z10.b, z0.b[2]\n"
                         "sdot z19.s, z11.b, z0.b[2]\n"
-                        "b 9f\n"
-                        "7:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "6:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 15f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 16f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 17f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "17:\n"
+                        "b 17f\n"
+                        "16:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "16:\n"
+                        "b 17f\n"
+                        "15:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "18:\n"
+                        "17:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -610,33 +620,33 @@
                         "sdot z17.s, z13.b, z0.b[1]\n"
                         "sdot z18.s, z14.b, z0.b[1]\n"
                         "sdot z19.s, z15.b, z0.b[1]\n"
-                        "b 9f\n"
-                        "6:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "5:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 18f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 19f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 20f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "20:\n"
+                        "b 20f\n"
+                        "19:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "19:\n"
+                        "b 20f\n"
+                        "18:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "21:\n"
+                        "20:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -650,38 +660,38 @@
                         "sdot z17.s, z9.b, z0.b[0]\n"
                         "sdot z18.s, z10.b, z0.b[0]\n"
                         "sdot z19.s, z11.b, z0.b[0]\n"
-                        "b 9f\n"
-                        "5:\n"
+                        "b 8f\n"
+                        "4:\n"
                         "sdot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "sdot z17.s, z9.b, z0.b[0]\n"
                         "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
                         "sdot z18.s, z10.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z19.s, z11.b, z0.b[0]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z14.b, z15.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z16.s, z12.b, z0.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z17.s, z13.b, z0.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z15.b, z0.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
@@ -708,15 +718,15 @@
                         "sdot z17.s, z13.b, z0.b[3]\n"
                         "sdot z18.s, z14.b, z0.b[3]\n"
                         "sdot z19.s, z15.b, z0.b[3]\n"
-                        "cbz %[blocks], 22f\n"
+                        "cbz %[blocks], 21f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -730,15 +740,15 @@
                         "sdot z17.s, z9.b, z4.b[0]\n"
                         "sdot z18.s, z10.b, z4.b[0]\n"
                         "sdot z19.s, z11.b, z4.b[0]\n"
-                        "b.eq 23f\n"
+                        "b.eq 22f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -752,13 +762,13 @@
                         "sdot z17.s, z13.b, z4.b[1]\n"
                         "sdot z18.s, z14.b, z4.b[1]\n"
                         "sdot z19.s, z15.b, z4.b[1]\n"
-                        "b.eq 24f\n"
+                        "b.eq 23f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -773,31 +783,31 @@
                         "sdot z17.s, z9.b, z4.b[2]\n"
                         "sdot z18.s, z10.b, z4.b[2]\n"
                         "sdot z19.s, z11.b, z4.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 24f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 25f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 26f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "26:\n"
+                        "b 26f\n"
+                        "25:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "25:\n"
+                        "b 26f\n"
+                        "24:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "27:\n"
+                        "26:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -811,33 +821,33 @@
                         "sdot z17.s, z13.b, z4.b[3]\n"
                         "sdot z18.s, z14.b, z4.b[3]\n"
                         "sdot z19.s, z15.b, z4.b[3]\n"
-                        "b 9f\n"
-                        "24:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "23:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 27f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 28f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 29f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "29:\n"
+                        "b 29f\n"
+                        "28:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "28:\n"
+                        "b 29f\n"
+                        "27:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "30:\n"
+                        "29:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -851,33 +861,33 @@
                         "sdot z17.s, z9.b, z4.b[2]\n"
                         "sdot z18.s, z10.b, z4.b[2]\n"
                         "sdot z19.s, z11.b, z4.b[2]\n"
-                        "b 9f\n"
-                        "23:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "22:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 30f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 31f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 32f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "32:\n"
+                        "b 32f\n"
+                        "31:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "31:\n"
+                        "b 32f\n"
+                        "30:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "33:\n"
+                        "32:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -891,33 +901,33 @@
                         "sdot z17.s, z13.b, z4.b[1]\n"
                         "sdot z18.s, z14.b, z4.b[1]\n"
                         "sdot z19.s, z15.b, z4.b[1]\n"
-                        "b 9f\n"
-                        "22:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "21:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 33f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 34f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 35f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "35:\n"
+                        "b 35f\n"
+                        "34:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "34:\n"
+                        "b 35f\n"
+                        "33:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "36:\n"
+                        "35:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -931,14 +941,14 @@
                         "sdot z17.s, z9.b, z4.b[0]\n"
                         "sdot z18.s, z10.b, z4.b[0]\n"
                         "sdot z19.s, z11.b, z4.b[0]\n"
-                        "9:\n"
+                        "8:\n"
                         "st1w z16.s, p0, [%[c_ptr0]]\n"
                         "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
                         "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
                         "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
                         "addvl %[c_ptr0], %[c_ptr0], #4\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
                     );
                     break;
@@ -971,103 +981,108 @@
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "mov z21.s, #0\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "mov z22.s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
                         "zip2 z11.b, z8.b, z9.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "mov z22.s, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z23.s, #0\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 2f\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
                         "1:\n"
                         "ld1rw z15.s, p7/z, [%[betaptr]]\n"
                         "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
                         "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
                         "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
                         "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1w z20.s, p0/z, [c_ptr1]\n"
                         "mul z16.s, p7/m, z16.s, z15.s\n"
-                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
                         "mul z17.s, p7/m, z17.s, z15.s\n"
-                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
                         "mul z18.s, p7/m, z18.s, z15.s\n"
-                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
                         "mul z19.s, p7/m, z19.s, z15.s\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
                         "mul z20.s, p7/m, z20.s, z15.s\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                         "mul z21.s, p7/m, z21.s, z15.s\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                         "mul z22.s, p7/m, z22.s, z15.s\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "mul z23.s, p7/m, z23.s, z15.s\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
                         "add a_ptr1, a_ptr1, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
-                        "2:\n"
-                        "cbz %[loops], 3f\n"
-                        "4:\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
                         "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "sdot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "subs %[loops], %[loops], #0x1\n"
                         "sdot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z10.b, z0.b[0]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                         "zip1 z14.b, z15.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z11.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z23.s, z11.b, z1.b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
                         "sdot z16.s, z12.b, z0.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "sdot z20.s, z12.b, z1.b[1]\n"
-                        "subs %[loops], %[loops], #0x1\n"
                         "zip2 z12.b, z10.b, z8.b\n"
                         "zip1 z10.b, z10.b, z8.b\n"
                         "sdot z17.s, z13.b, z0.b[1]\n"
@@ -1092,148 +1107,148 @@
                         "sdot z16.s, z8.b, z0.b[2]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "sdot z20.s, z8.b, z1.b[2]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
                         "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
                         "sdot z21.s, z9.b, z1.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z10.b, z0.b[2]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z11.b, z0.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z23.s, z11.b, z1.b[2]\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
                         "sdot z17.s, z13.b, z0.b[3]\n"
                         "sdot z21.s, z13.b, z1.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
                         "sdot z18.s, z14.b, z0.b[3]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "sdot z22.s, z14.b, z1.b[3]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z15.b, z0.b[3]\n"
                         "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
                         "sdot z23.s, z15.b, z1.b[3]\n"
                         "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z20.s, z8.b, z5.b[0]\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
                         "sdot z21.s, z9.b, z5.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z10.b, z4.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z22.s, z10.b, z5.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "sdot z23.s, z11.b, z5.b[0]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "sdot z20.s, z12.b, z5.b[1]\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
                         "sdot z21.s, z13.b, z5.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z14.b, z4.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z22.s, z14.b, z5.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "sdot z23.s, z15.b, z5.b[1]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "sdot z20.s, z8.b, z5.b[2]\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
                         "sdot z21.s, z9.b, z5.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z10.b, z4.b[2]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z22.s, z10.b, z5.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "sdot z23.s, z11.b, z5.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z16.s, z12.b, z4.b[3]\n"
-                        "sdot z20.s, z12.b, z5.b[3]\n"
-                        "sdot z17.s, z13.b, z4.b[3]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z12.b, z4.b[3]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z5.b[3]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "sdot z17.s, z13.b, z4.b[3]\n"
                         "sdot z21.s, z13.b, z5.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z14.b, z4.b[3]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z22.s, z14.b, z5.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "sdot z19.s, z15.b, z4.b[3]\n"
-                        "sdot z23.s, z15.b, z5.b[3]\n"
-                        "b.ne 4b\n"
-                        "3:\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z14.b, z5.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z15.b, z4.b[3]\n"
+                        "sdot z23.s, z15.b, z5.b[3]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
-                        "cbz %[regs], 5f\n"
+                        "cbz %[regs], 4f\n"
                         "sdot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "sdot z20.s, z8.b, z1.b[0]\n"
@@ -1245,13 +1260,13 @@
                         "zip2 z8.b, z14.b, z12.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z14.b, z15.b, z8.b\n"
@@ -1261,142 +1276,142 @@
                         "sdot z19.s, z11.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z23.s, z11.b, z1.b[0]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
                         "sdot z17.s, z13.b, z0.b[1]\n"
                         "sdot z21.s, z13.b, z1.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
                         "sdot z18.s, z14.b, z0.b[1]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
                         "sdot z21.s, z9.b, z1.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z10.b, z0.b[2]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "sdot z20.s, z12.b, z1.b[3]\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z12.b, z0.b[3]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z1.b[3]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "sdot z17.s, z13.b, z0.b[3]\n"
                         "sdot z21.s, z13.b, z1.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z14.b, z0.b[3]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "sdot z22.s, z14.b, z1.b[3]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "sdot z19.s, z15.b, z0.b[3]\n"
                         "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
                         "sdot z23.s, z15.b, z1.b[3]\n"
                         "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "sdot z20.s, z8.b, z5.b[0]\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z8.b, z4.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z20.s, z8.b, z5.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z17.s, z9.b, z4.b[0]\n"
                         "sdot z21.s, z9.b, z5.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z10.b, z4.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z22.s, z10.b, z5.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "sdot z23.s, z11.b, z5.b[0]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z11.b, z4.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "sdot z20.s, z12.b, z5.b[1]\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
+                        "sdot z23.s, z11.b, z5.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z12.b, z4.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z5.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "sdot z17.s, z13.b, z4.b[1]\n"
                         "sdot z21.s, z13.b, z5.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z14.b, z4.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z22.s, z14.b, z5.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "sdot z23.s, z15.b, z5.b[1]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "sdot z20.s, z8.b, z5.b[2]\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z15.b, z4.b[1]\n"
+                        "sdot z23.s, z15.b, z5.b[1]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z8.b, z4.b[2]\n"
+                        "sdot z20.s, z8.b, z5.b[2]\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
                         "sdot z21.s, z9.b, z5.b[2]\n"
                         "sdot z18.s, z10.b, z4.b[2]\n"
                         "sdot z22.s, z10.b, z5.b[2]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "sdot z23.s, z11.b, z5.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z19.s, z11.b, z4.b[2]\n"
+                        "sdot z23.s, z11.b, z5.b[2]\n"
                         "sdot z16.s, z12.b, z4.b[3]\n"
                         "sdot z20.s, z12.b, z5.b[3]\n"
                         "sdot z17.s, z13.b, z4.b[3]\n"
@@ -1405,15 +1420,15 @@
                         "sdot z22.s, z14.b, z5.b[3]\n"
                         "sdot z19.s, z15.b, z4.b[3]\n"
                         "sdot z23.s, z15.b, z5.b[3]\n"
-                        "cbz %[blocks], 6f\n"
+                        "cbz %[blocks], 5f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -1431,15 +1446,15 @@
                         "sdot z22.s, z10.b, z1.b[0]\n"
                         "sdot z19.s, z11.b, z0.b[0]\n"
                         "sdot z23.s, z11.b, z1.b[0]\n"
-                        "b.eq 7f\n"
+                        "b.eq 6f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -1457,13 +1472,13 @@
                         "sdot z22.s, z14.b, z1.b[1]\n"
                         "sdot z19.s, z15.b, z0.b[1]\n"
                         "sdot z23.s, z15.b, z1.b[1]\n"
-                        "b.eq 8f\n"
+                        "b.eq 7f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -1482,31 +1497,31 @@
                         "sdot z22.s, z10.b, z1.b[2]\n"
                         "sdot z19.s, z11.b, z0.b[2]\n"
                         "sdot z23.s, z11.b, z1.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 9f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 10f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 11f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "11:\n"
+                        "b 11f\n"
+                        "10:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "10:\n"
+                        "b 11f\n"
+                        "9:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "12:\n"
+                        "11:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -1524,33 +1539,33 @@
                         "sdot z22.s, z14.b, z1.b[3]\n"
                         "sdot z19.s, z15.b, z0.b[3]\n"
                         "sdot z23.s, z15.b, z1.b[3]\n"
-                        "b 9f\n"
-                        "8:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "7:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 12f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 13f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 14f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "14:\n"
+                        "b 14f\n"
+                        "13:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "13:\n"
+                        "b 14f\n"
+                        "12:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "15:\n"
+                        "14:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -1568,33 +1583,33 @@
                         "sdot z22.s, z10.b, z1.b[2]\n"
                         "sdot z19.s, z11.b, z0.b[2]\n"
                         "sdot z23.s, z11.b, z1.b[2]\n"
-                        "b 9f\n"
-                        "7:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "6:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 15f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 16f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 17f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "17:\n"
+                        "b 17f\n"
+                        "16:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "16:\n"
+                        "b 17f\n"
+                        "15:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "18:\n"
+                        "17:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -1612,33 +1627,33 @@
                         "sdot z22.s, z14.b, z1.b[1]\n"
                         "sdot z19.s, z15.b, z0.b[1]\n"
                         "sdot z23.s, z15.b, z1.b[1]\n"
-                        "b 9f\n"
-                        "6:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "5:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 18f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 19f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 20f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "20:\n"
+                        "b 20f\n"
+                        "19:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "19:\n"
+                        "b 20f\n"
+                        "18:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "21:\n"
+                        "20:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -1656,8 +1671,8 @@
                         "sdot z22.s, z10.b, z1.b[0]\n"
                         "sdot z19.s, z11.b, z0.b[0]\n"
                         "sdot z23.s, z11.b, z1.b[0]\n"
-                        "b 9f\n"
-                        "5:\n"
+                        "b 8f\n"
+                        "4:\n"
                         "sdot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "sdot z20.s, z8.b, z1.b[0]\n"
@@ -1669,13 +1684,13 @@
                         "zip2 z8.b, z14.b, z12.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z14.b, z15.b, z8.b\n"
@@ -1685,44 +1700,44 @@
                         "sdot z19.s, z11.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z23.s, z11.b, z1.b[0]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "sdot z16.s, z12.b, z0.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "sdot z20.s, z12.b, z1.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
                         "sdot z17.s, z13.b, z0.b[1]\n"
                         "sdot z21.s, z13.b, z1.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
                         "sdot z18.s, z14.b, z0.b[1]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "sdot z19.s, z15.b, z0.b[1]\n"
+                        "sdot z23.s, z15.b, z1.b[1]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "sdot z16.s, z8.b, z0.b[2]\n"
+                        "sdot z20.s, z8.b, z1.b[2]\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
                         "sdot z21.s, z9.b, z1.b[2]\n"
                         "sdot z18.s, z10.b, z0.b[2]\n"
                         "sdot z22.s, z10.b, z1.b[2]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z19.s, z11.b, z0.b[2]\n"
+                        "sdot z23.s, z11.b, z1.b[2]\n"
                         "sdot z16.s, z12.b, z0.b[3]\n"
                         "sdot z20.s, z12.b, z1.b[3]\n"
                         "sdot z17.s, z13.b, z0.b[3]\n"
@@ -1731,15 +1746,15 @@
                         "sdot z22.s, z14.b, z1.b[3]\n"
                         "sdot z19.s, z15.b, z0.b[3]\n"
                         "sdot z23.s, z15.b, z1.b[3]\n"
-                        "cbz %[blocks], 22f\n"
+                        "cbz %[blocks], 21f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -1757,15 +1772,15 @@
                         "sdot z22.s, z10.b, z5.b[0]\n"
                         "sdot z19.s, z11.b, z4.b[0]\n"
                         "sdot z23.s, z11.b, z5.b[0]\n"
-                        "b.eq 23f\n"
+                        "b.eq 22f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -1783,13 +1798,13 @@
                         "sdot z22.s, z14.b, z5.b[1]\n"
                         "sdot z19.s, z15.b, z4.b[1]\n"
                         "sdot z23.s, z15.b, z5.b[1]\n"
-                        "b.eq 24f\n"
+                        "b.eq 23f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -1808,31 +1823,31 @@
                         "sdot z22.s, z10.b, z5.b[2]\n"
                         "sdot z19.s, z11.b, z4.b[2]\n"
                         "sdot z23.s, z11.b, z5.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 24f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 25f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 26f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "26:\n"
+                        "b 26f\n"
+                        "25:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "25:\n"
+                        "b 26f\n"
+                        "24:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "27:\n"
+                        "26:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -1850,33 +1865,33 @@
                         "sdot z22.s, z14.b, z5.b[3]\n"
                         "sdot z19.s, z15.b, z4.b[3]\n"
                         "sdot z23.s, z15.b, z5.b[3]\n"
-                        "b 9f\n"
-                        "24:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "23:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 27f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 28f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 29f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "29:\n"
+                        "b 29f\n"
+                        "28:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "28:\n"
+                        "b 29f\n"
+                        "27:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "30:\n"
+                        "29:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -1894,33 +1909,33 @@
                         "sdot z22.s, z10.b, z5.b[2]\n"
                         "sdot z19.s, z11.b, z4.b[2]\n"
                         "sdot z23.s, z11.b, z5.b[2]\n"
-                        "b 9f\n"
-                        "23:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "22:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 30f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 31f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 32f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "32:\n"
+                        "b 32f\n"
+                        "31:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "31:\n"
+                        "b 32f\n"
+                        "30:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "33:\n"
+                        "32:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -1938,33 +1953,33 @@
                         "sdot z22.s, z14.b, z5.b[1]\n"
                         "sdot z19.s, z15.b, z4.b[1]\n"
                         "sdot z23.s, z15.b, z5.b[1]\n"
-                        "b 9f\n"
-                        "22:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "21:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 33f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 34f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 35f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "35:\n"
+                        "b 35f\n"
+                        "34:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "34:\n"
+                        "b 35f\n"
+                        "33:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "36:\n"
+                        "35:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -1982,7 +1997,7 @@
                         "sdot z22.s, z10.b, z5.b[0]\n"
                         "sdot z19.s, z11.b, z4.b[0]\n"
                         "sdot z23.s, z11.b, z5.b[0]\n"
-                        "9:\n"
+                        "8:\n"
                         "st1w z16.s, p0, [%[c_ptr0]]\n"
                         "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
                         "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
@@ -1995,7 +2010,7 @@
                         ".unreq a_ptr1\n"
                         ".unreq c_ptr1\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
                     );
                     break;
@@ -2007,11 +2022,11 @@
                         "c_ptr2 .req X3\n"
                         "add a_ptr1, %[a_ptr0], %[lda]\n"
                         "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
                         "whilelt p6.b, %[temp], %[leftovers]\n"
                         "whilelt p0.s, %[temp], %[width]\n"
                         "whilelt p4.b, %[temp], %[width]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
                         "incw %[temp], all, mul #1\n"
                         "ptrue p7.b\n"
                         "whilelt p1.s, %[temp], %[width]\n"
@@ -2034,116 +2049,122 @@
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "mov z22.s, #0\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "mov z23.s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
                         "zip2 z11.b, z8.b, z9.b\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "mov z23.s, #0\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
                         "mov z24.s, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z25.s, #0\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "mov z26.s, #0\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "mov z25.s, #0\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "mov z26.s, #0\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "mov z27.s, #0\n"
-                        "b 2f\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
                         "1:\n"
                         "ld1rw z15.s, p7/z, [%[betaptr]]\n"
                         "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
                         "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
                         "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
                         "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1w z20.s, p0/z, [c_ptr1]\n"
                         "mul z16.s, p7/m, z16.s, z15.s\n"
-                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
                         "mul z17.s, p7/m, z17.s, z15.s\n"
-                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
                         "mul z18.s, p7/m, z18.s, z15.s\n"
-                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
                         "mul z19.s, p7/m, z19.s, z15.s\n"
-                        "ld1w z24.s, p0/z, [c_ptr2]\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
                         "mul z20.s, p7/m, z20.s, z15.s\n"
-                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "ld1w z24.s, p0/z, [c_ptr2]\n"
                         "mul z21.s, p7/m, z21.s, z15.s\n"
-                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
                         "mul z22.s, p7/m, z22.s, z15.s\n"
-                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
                         "mul z23.s, p7/m, z23.s, z15.s\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
                         "mul z24.s, p7/m, z24.s, z15.s\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                         "mul z25.s, p7/m, z25.s, z15.s\n"
-                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                         "mul z26.s, p7/m, z26.s, z15.s\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
                         "mul z27.s, p7/m, z27.s, z15.s\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "add a_ptr1, a_ptr1, #0x10\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
                         "zip2 z11.b, z8.b, z9.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
-                        "2:\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "cbz %[loops], 3f\n"
-                        "4:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
                         "sdot z16.s, z8.b, z0.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "sdot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "sdot z24.s, z8.b, z2.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
                         "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z24.s, z8.b, z2.b[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z25.s, z9.b, z2.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "add %[a_ptr0], %[a_ptr0], #0x20\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "add a_ptr1, a_ptr1, #0x20\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
                         "sdot z26.s, z10.b, z2.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z11.b, z0.b[0]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "sdot z23.s, z11.b, z1.b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
                         "sdot z27.s, z11.b, z2.b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -2157,15 +2178,15 @@
                         "sdot z21.s, z13.b, z1.b[1]\n"
                         "sdot z25.s, z13.b, z2.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
                         "sdot z26.s, z14.b, z2.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z15.b, z0.b[1]\n"
@@ -2185,15 +2206,15 @@
                         "sdot z21.s, z9.b, z1.b[2]\n"
                         "sdot z25.s, z9.b, z2.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
                         "sdot z26.s, z10.b, z2.b[2]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z11.b, z0.b[2]\n"
@@ -2213,15 +2234,15 @@
                         "sdot z21.s, z13.b, z1.b[3]\n"
                         "sdot z25.s, z13.b, z2.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
                         "sdot z26.s, z14.b, z2.b[3]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z15.b, z0.b[3]\n"
@@ -2244,15 +2265,15 @@
                         "sdot z21.s, z9.b, z5.b[0]\n"
                         "sdot z25.s, z9.b, z6.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
                         "sdot z26.s, z10.b, z6.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z11.b, z4.b[0]\n"
@@ -2272,15 +2293,15 @@
                         "sdot z21.s, z13.b, z5.b[1]\n"
                         "sdot z25.s, z13.b, z6.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z14.b, z5.b[1]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z14.b, z5.b[1]\n"
                         "sdot z26.s, z14.b, z6.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z15.b, z4.b[1]\n"
@@ -2300,15 +2321,15 @@
                         "sdot z21.s, z9.b, z5.b[2]\n"
                         "sdot z25.s, z9.b, z6.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z10.b, z5.b[2]\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
                         "sdot z26.s, z10.b, z6.b[2]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z11.b, z4.b[2]\n"
@@ -2328,23 +2349,23 @@
                         "sdot z21.s, z13.b, z5.b[3]\n"
                         "sdot z25.s, z13.b, z6.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z4.b[3]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z18.s, z14.b, z4.b[3]\n"
                         "sdot z22.s, z14.b, z5.b[3]\n"
                         "sdot z26.s, z14.b, z6.b[3]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z15.b, z4.b[3]\n"
                         "sdot z23.s, z15.b, z5.b[3]\n"
                         "sdot z27.s, z15.b, z6.b[3]\n"
-                        "b.ne 4b\n"
-                        "3:\n"
+                        "b.ne 3b\n"
+                        "2:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
-                        "cbz %[regs], 5f\n"
+                        "cbz %[regs], 4f\n"
                         "sdot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "sdot z20.s, z8.b, z1.b[0]\n"
@@ -2353,24 +2374,24 @@
                         "ld1rqb z5.b, p7/z, [a_ptr1]\n"
                         "sdot z17.s, z9.b, z0.b[0]\n"
                         "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z25.s, z9.b, z2.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "sdot z26.s, z10.b, z2.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z11.b, z0.b[0]\n"
@@ -2390,15 +2411,15 @@
                         "sdot z21.s, z13.b, z1.b[1]\n"
                         "sdot z25.s, z13.b, z2.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z14.b, z1.b[1]\n"
                         "sdot z26.s, z14.b, z2.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z15.b, z0.b[1]\n"
@@ -2418,15 +2439,15 @@
                         "sdot z21.s, z9.b, z1.b[2]\n"
                         "sdot z25.s, z9.b, z2.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
                         "sdot z26.s, z10.b, z2.b[2]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z11.b, z0.b[2]\n"
@@ -2446,15 +2467,15 @@
                         "sdot z21.s, z13.b, z1.b[3]\n"
                         "sdot z25.s, z13.b, z2.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z14.b, z1.b[3]\n"
+                        "sdot z18.s, z14.b, z0.b[3]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z14.b, z1.b[3]\n"
                         "sdot z26.s, z14.b, z2.b[3]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z15.b, z0.b[3]\n"
@@ -2477,15 +2498,15 @@
                         "sdot z21.s, z9.b, z5.b[0]\n"
                         "sdot z25.s, z9.b, z6.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z10.b, z5.b[0]\n"
+                        "sdot z18.s, z10.b, z4.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z10.b, z5.b[0]\n"
                         "sdot z26.s, z10.b, z6.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z11.b, z4.b[0]\n"
@@ -2505,12 +2526,12 @@
                         "sdot z21.s, z13.b, z5.b[1]\n"
                         "sdot z25.s, z13.b, z6.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z18.s, z14.b, z4.b[1]\n"
                         "sdot z22.s, z14.b, z5.b[1]\n"
                         "sdot z26.s, z14.b, z6.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -2523,17 +2544,17 @@
                         "sdot z16.s, z8.b, z4.b[2]\n"
                         "sdot z20.s, z8.b, z5.b[2]\n"
                         "sdot z24.s, z8.b, z6.b[2]\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
                         "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z17.s, z9.b, z4.b[2]\n"
                         "sdot z21.s, z9.b, z5.b[2]\n"
                         "sdot z25.s, z9.b, z6.b[2]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "sdot z22.s, z10.b, z5.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z18.s, z10.b, z4.b[2]\n"
+                        "sdot z22.s, z10.b, z5.b[2]\n"
                         "sdot z26.s, z10.b, z6.b[2]\n"
                         "sdot z19.s, z11.b, z4.b[2]\n"
                         "sdot z23.s, z11.b, z5.b[2]\n"
@@ -2550,15 +2571,15 @@
                         "sdot z19.s, z15.b, z4.b[3]\n"
                         "sdot z23.s, z15.b, z5.b[3]\n"
                         "sdot z27.s, z15.b, z6.b[3]\n"
-                        "cbz %[blocks], 6f\n"
+                        "cbz %[blocks], 5f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -2580,15 +2601,15 @@
                         "sdot z19.s, z11.b, z0.b[0]\n"
                         "sdot z23.s, z11.b, z1.b[0]\n"
                         "sdot z27.s, z11.b, z2.b[0]\n"
-                        "b.eq 7f\n"
+                        "b.eq 6f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -2610,13 +2631,13 @@
                         "sdot z19.s, z15.b, z0.b[1]\n"
                         "sdot z23.s, z15.b, z1.b[1]\n"
                         "sdot z27.s, z15.b, z2.b[1]\n"
-                        "b.eq 8f\n"
+                        "b.eq 7f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -2639,31 +2660,31 @@
                         "sdot z19.s, z11.b, z0.b[2]\n"
                         "sdot z23.s, z11.b, z1.b[2]\n"
                         "sdot z27.s, z11.b, z2.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 9f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 10f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 11f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "11:\n"
+                        "b 11f\n"
+                        "10:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "10:\n"
+                        "b 11f\n"
+                        "9:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "12:\n"
+                        "11:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -2685,33 +2706,33 @@
                         "sdot z19.s, z15.b, z0.b[3]\n"
                         "sdot z23.s, z15.b, z1.b[3]\n"
                         "sdot z27.s, z15.b, z2.b[3]\n"
-                        "b 9f\n"
-                        "8:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "7:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 12f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 13f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 14f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "14:\n"
+                        "b 14f\n"
+                        "13:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "13:\n"
+                        "b 14f\n"
+                        "12:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "15:\n"
+                        "14:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -2733,33 +2754,33 @@
                         "sdot z19.s, z11.b, z0.b[2]\n"
                         "sdot z23.s, z11.b, z1.b[2]\n"
                         "sdot z27.s, z11.b, z2.b[2]\n"
-                        "b 9f\n"
-                        "7:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "6:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 15f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 16f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 17f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "17:\n"
+                        "b 17f\n"
+                        "16:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "16:\n"
+                        "b 17f\n"
+                        "15:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "18:\n"
+                        "17:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -2781,33 +2802,33 @@
                         "sdot z19.s, z15.b, z0.b[1]\n"
                         "sdot z23.s, z15.b, z1.b[1]\n"
                         "sdot z27.s, z15.b, z2.b[1]\n"
-                        "b 9f\n"
-                        "6:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "5:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 18f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 19f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 20f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "20:\n"
+                        "b 20f\n"
+                        "19:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "19:\n"
+                        "b 20f\n"
+                        "18:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "21:\n"
+                        "20:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -2829,8 +2850,8 @@
                         "sdot z19.s, z11.b, z0.b[0]\n"
                         "sdot z23.s, z11.b, z1.b[0]\n"
                         "sdot z27.s, z11.b, z2.b[0]\n"
-                        "b 9f\n"
-                        "5:\n"
+                        "b 8f\n"
+                        "4:\n"
                         "sdot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "sdot z20.s, z8.b, z1.b[0]\n"
@@ -2839,24 +2860,24 @@
                         "ld1rqb z5.b, p6/z, [a_ptr1]\n"
                         "sdot z17.s, z9.b, z0.b[0]\n"
                         "ld1rqb z6.b, p6/z, [a_ptr2]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z25.s, z9.b, z2.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "sdot z18.s, z10.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "sdot z26.s, z10.b, z2.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z11.b, z0.b[0]\n"
@@ -2876,12 +2897,12 @@
                         "sdot z21.s, z13.b, z1.b[1]\n"
                         "sdot z25.s, z13.b, z2.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z18.s, z14.b, z0.b[1]\n"
                         "sdot z22.s, z14.b, z1.b[1]\n"
                         "sdot z26.s, z14.b, z2.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -2894,17 +2915,17 @@
                         "sdot z16.s, z8.b, z0.b[2]\n"
                         "sdot z20.s, z8.b, z1.b[2]\n"
                         "sdot z24.s, z8.b, z2.b[2]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
                         "zip1 z14.b, z14.b, z12.b\n"
+                        "sdot z17.s, z9.b, z0.b[2]\n"
                         "sdot z21.s, z9.b, z1.b[2]\n"
                         "sdot z25.s, z9.b, z2.b[2]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z18.s, z10.b, z0.b[2]\n"
+                        "sdot z22.s, z10.b, z1.b[2]\n"
                         "sdot z26.s, z10.b, z2.b[2]\n"
                         "sdot z19.s, z11.b, z0.b[2]\n"
                         "sdot z23.s, z11.b, z1.b[2]\n"
@@ -2921,15 +2942,15 @@
                         "sdot z19.s, z15.b, z0.b[3]\n"
                         "sdot z23.s, z15.b, z1.b[3]\n"
                         "sdot z27.s, z15.b, z2.b[3]\n"
-                        "cbz %[blocks], 22f\n"
+                        "cbz %[blocks], 21f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -2951,15 +2972,15 @@
                         "sdot z19.s, z11.b, z4.b[0]\n"
                         "sdot z23.s, z11.b, z5.b[0]\n"
                         "sdot z27.s, z11.b, z6.b[0]\n"
-                        "b.eq 23f\n"
+                        "b.eq 22f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -2981,13 +3002,13 @@
                         "sdot z19.s, z15.b, z4.b[1]\n"
                         "sdot z23.s, z15.b, z5.b[1]\n"
                         "sdot z27.s, z15.b, z6.b[1]\n"
-                        "b.eq 24f\n"
+                        "b.eq 23f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -3010,31 +3031,31 @@
                         "sdot z19.s, z11.b, z4.b[2]\n"
                         "sdot z23.s, z11.b, z5.b[2]\n"
                         "sdot z27.s, z11.b, z6.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 24f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 25f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 26f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "26:\n"
+                        "b 26f\n"
+                        "25:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "25:\n"
+                        "b 26f\n"
+                        "24:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "27:\n"
+                        "26:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -3056,33 +3077,33 @@
                         "sdot z19.s, z15.b, z4.b[3]\n"
                         "sdot z23.s, z15.b, z5.b[3]\n"
                         "sdot z27.s, z15.b, z6.b[3]\n"
-                        "b 9f\n"
-                        "24:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "23:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 27f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 28f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 29f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "29:\n"
+                        "b 29f\n"
+                        "28:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "28:\n"
+                        "b 29f\n"
+                        "27:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "30:\n"
+                        "29:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -3104,33 +3125,33 @@
                         "sdot z19.s, z11.b, z4.b[2]\n"
                         "sdot z23.s, z11.b, z5.b[2]\n"
                         "sdot z27.s, z11.b, z6.b[2]\n"
-                        "b 9f\n"
-                        "23:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "22:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 30f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 31f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 32f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "32:\n"
+                        "b 32f\n"
+                        "31:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "31:\n"
+                        "b 32f\n"
+                        "30:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "33:\n"
+                        "32:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -3152,33 +3173,33 @@
                         "sdot z19.s, z15.b, z4.b[1]\n"
                         "sdot z23.s, z15.b, z5.b[1]\n"
                         "sdot z27.s, z15.b, z6.b[1]\n"
-                        "b 9f\n"
-                        "22:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "21:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 33f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 34f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 35f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "35:\n"
+                        "b 35f\n"
+                        "34:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "34:\n"
+                        "b 35f\n"
+                        "33:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "36:\n"
+                        "35:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -3200,7 +3221,7 @@
                         "sdot z19.s, z11.b, z4.b[0]\n"
                         "sdot z23.s, z11.b, z5.b[0]\n"
                         "sdot z27.s, z11.b, z6.b[0]\n"
-                        "9:\n"
+                        "8:\n"
                         "st1w z16.s, p0, [%[c_ptr0]]\n"
                         "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
                         "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
@@ -3219,7 +3240,7 @@
                         ".unreq c_ptr1\n"
                         ".unreq c_ptr2\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
                     );
                     break;
@@ -3234,15 +3255,15 @@
                         "c_ptr3 .req X5\n"
                         "add a_ptr1, %[a_ptr0], %[lda]\n"
                         "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "add a_ptr3, a_ptr2, %[lda]\n"
+                        "add c_ptr3, c_ptr2, %[ldc]\n"
                         "whilelt p6.b, %[temp], %[leftovers]\n"
                         "whilelt p0.s, %[temp], %[width]\n"
                         "whilelt p4.b, %[temp], %[width]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
                         "incw %[temp], all, mul #1\n"
                         "ptrue p7.b\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
                         "whilelt p1.s, %[temp], %[width]\n"
                         "incw %[temp], all, mul #1\n"
                         "whilelt p2.s, %[temp], %[width]\n"
@@ -3265,77 +3286,80 @@
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "mov z23.s, #0\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "mov z24.s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
                         "zip2 z11.b, z8.b, z9.b\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "mov z24.s, #0\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
                         "mov z25.s, #0\n"
                         "add a_ptr3, a_ptr3, #0x10\n"
-                        "mov z26.s, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "mov z27.s, #0\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "mov z28.s, #0\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "mov z26.s, #0\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "mov z27.s, #0\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "mov z28.s, #0\n"
                         "mov z29.s, #0\n"
                         "mov z30.s, #0\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
                         "mov z31.s, #0\n"
-                        "b 2f\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
                         "1:\n"
                         "ld1rw z15.s, p7/z, [%[betaptr]]\n"
                         "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
                         "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
                         "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
                         "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1w z20.s, p0/z, [c_ptr1]\n"
                         "mul z16.s, p7/m, z16.s, z15.s\n"
-                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
                         "mul z17.s, p7/m, z17.s, z15.s\n"
-                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
                         "mul z18.s, p7/m, z18.s, z15.s\n"
-                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
                         "mul z19.s, p7/m, z19.s, z15.s\n"
-                        "ld1w z24.s, p0/z, [c_ptr2]\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
                         "mul z20.s, p7/m, z20.s, z15.s\n"
-                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "ld1w z24.s, p0/z, [c_ptr2]\n"
                         "mul z21.s, p7/m, z21.s, z15.s\n"
-                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
                         "mul z22.s, p7/m, z22.s, z15.s\n"
-                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
                         "mul z23.s, p7/m, z23.s, z15.s\n"
-                        "ld1w z28.s, p0/z, [c_ptr3]\n"
+                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
                         "mul z24.s, p7/m, z24.s, z15.s\n"
-                        "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+                        "ld1w z28.s, p0/z, [c_ptr3]\n"
                         "mul z25.s, p7/m, z25.s, z15.s\n"
-                        "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+                        "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
                         "mul z26.s, p7/m, z26.s, z15.s\n"
-                        "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+                        "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
                         "mul z27.s, p7/m, z27.s, z15.s\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
                         "mul z28.s, p7/m, z28.s, z15.s\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                         "mul z29.s, p7/m, z29.s, z15.s\n"
-                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                         "mul z30.s, p7/m, z30.s, z15.s\n"
-                        "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
                         "mul z31.s, p7/m, z31.s, z15.s\n"
+                        "ld1rqb z3.b, p7/z, [a_ptr3]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "add a_ptr1, a_ptr1, #0x10\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "add a_ptr2, a_ptr2, #0x10\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "add a_ptr3, a_ptr3, #0x10\n"
@@ -3344,21 +3368,20 @@
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "2:\n"
-                        "cbz %[loops], 3f\n"
-                        "4:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
                         "sdot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "sdot z20.s, z8.b, z1.b[0]\n"
@@ -3367,38 +3390,38 @@
                         "ld1rqb z5.b, p7/z, [a_ptr1]\n"
                         "sdot z28.s, z8.b, z3.b[0]\n"
                         "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z25.s, z9.b, z2.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z14.b, z15.b, z8.b\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                         "sdot z29.s, z9.b, z3.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
                         "sdot z26.s, z10.b, z2.b[0]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
                         "sdot z30.s, z10.b, z3.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "sdot z19.s, z11.b, z0.b[0]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "sdot z23.s, z11.b, z1.b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
+                        "add a_ptr3, a_ptr3, #0x20\n"
                         "sdot z27.s, z11.b, z2.b[0]\n"
                         "sdot z31.s, z11.b, z3.b[0]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
@@ -3414,17 +3437,17 @@
                         "sdot z17.s, z13.b, z0.b[1]\n"
                         "sdot z21.s, z13.b, z1.b[1]\n"
                         "sdot z25.s, z13.b, z2.b[1]\n"
-                        "sdot z29.s, z13.b, z3.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z29.s, z13.b, z3.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z14.b, z0.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z22.s, z14.b, z1.b[1]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "sdot z26.s, z14.b, z2.b[1]\n"
                         "sdot z30.s, z14.b, z3.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3446,17 +3469,17 @@
                         "sdot z17.s, z9.b, z0.b[2]\n"
                         "sdot z21.s, z9.b, z1.b[2]\n"
                         "sdot z25.s, z9.b, z2.b[2]\n"
-                        "sdot z29.s, z9.b, z3.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z29.s, z9.b, z3.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z10.b, z0.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z22.s, z10.b, z1.b[2]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "sdot z26.s, z10.b, z2.b[2]\n"
                         "sdot z30.s, z10.b, z3.b[2]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3478,17 +3501,17 @@
                         "sdot z17.s, z13.b, z0.b[3]\n"
                         "sdot z21.s, z13.b, z1.b[3]\n"
                         "sdot z25.s, z13.b, z2.b[3]\n"
-                        "sdot z29.s, z13.b, z3.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z29.s, z13.b, z3.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z14.b, z0.b[3]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z22.s, z14.b, z1.b[3]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "sdot z26.s, z14.b, z2.b[3]\n"
                         "sdot z30.s, z14.b, z3.b[3]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3514,17 +3537,17 @@
                         "sdot z17.s, z9.b, z4.b[0]\n"
                         "sdot z21.s, z9.b, z5.b[0]\n"
                         "sdot z25.s, z9.b, z6.b[0]\n"
-                        "sdot z29.s, z9.b, z7.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z29.s, z9.b, z7.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z10.b, z4.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z22.s, z10.b, z5.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "sdot z26.s, z10.b, z6.b[0]\n"
                         "sdot z30.s, z10.b, z7.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3546,17 +3569,17 @@
                         "sdot z17.s, z13.b, z4.b[1]\n"
                         "sdot z21.s, z13.b, z5.b[1]\n"
                         "sdot z25.s, z13.b, z6.b[1]\n"
-                        "sdot z29.s, z13.b, z7.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z29.s, z13.b, z7.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z14.b, z4.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z22.s, z14.b, z5.b[1]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "sdot z26.s, z14.b, z6.b[1]\n"
                         "sdot z30.s, z14.b, z7.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3578,17 +3601,17 @@
                         "sdot z17.s, z9.b, z4.b[2]\n"
                         "sdot z21.s, z9.b, z5.b[2]\n"
                         "sdot z25.s, z9.b, z6.b[2]\n"
-                        "sdot z29.s, z9.b, z7.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z29.s, z9.b, z7.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z10.b, z4.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z22.s, z10.b, z5.b[2]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "sdot z26.s, z10.b, z6.b[2]\n"
                         "sdot z30.s, z10.b, z7.b[2]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3610,13 +3633,13 @@
                         "sdot z17.s, z13.b, z4.b[3]\n"
                         "sdot z21.s, z13.b, z5.b[3]\n"
                         "sdot z25.s, z13.b, z6.b[3]\n"
-                        "sdot z29.s, z13.b, z7.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z29.s, z13.b, z7.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z14.b, z4.b[3]\n"
                         "sdot z22.s, z14.b, z5.b[3]\n"
                         "sdot z26.s, z14.b, z6.b[3]\n"
@@ -3626,11 +3649,11 @@
                         "sdot z23.s, z15.b, z5.b[3]\n"
                         "sdot z27.s, z15.b, z6.b[3]\n"
                         "sdot z31.s, z15.b, z7.b[3]\n"
-                        "b.ne 4b\n"
-                        "3:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
-                        "cbz %[regs], 5f\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
                         "sdot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "sdot z20.s, z8.b, z1.b[0]\n"
@@ -3639,27 +3662,27 @@
                         "ld1rqb z5.b, p7/z, [a_ptr1]\n"
                         "sdot z28.s, z8.b, z3.b[0]\n"
                         "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z25.s, z9.b, z2.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
                         "sdot z29.s, z9.b, z3.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
                         "sdot z26.s, z10.b, z2.b[0]\n"
                         "sdot z30.s, z10.b, z3.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3681,17 +3704,17 @@
                         "sdot z17.s, z13.b, z0.b[1]\n"
                         "sdot z21.s, z13.b, z1.b[1]\n"
                         "sdot z25.s, z13.b, z2.b[1]\n"
-                        "sdot z29.s, z13.b, z3.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z29.s, z13.b, z3.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z14.b, z0.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z22.s, z14.b, z1.b[1]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "sdot z26.s, z14.b, z2.b[1]\n"
                         "sdot z30.s, z14.b, z3.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3713,17 +3736,17 @@
                         "sdot z17.s, z9.b, z0.b[2]\n"
                         "sdot z21.s, z9.b, z1.b[2]\n"
                         "sdot z25.s, z9.b, z2.b[2]\n"
-                        "sdot z29.s, z9.b, z3.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z29.s, z9.b, z3.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z10.b, z0.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z22.s, z10.b, z1.b[2]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "sdot z26.s, z10.b, z2.b[2]\n"
                         "sdot z30.s, z10.b, z3.b[2]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3745,17 +3768,17 @@
                         "sdot z17.s, z13.b, z0.b[3]\n"
                         "sdot z21.s, z13.b, z1.b[3]\n"
                         "sdot z25.s, z13.b, z2.b[3]\n"
-                        "sdot z29.s, z13.b, z3.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z29.s, z13.b, z3.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z14.b, z0.b[3]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z22.s, z14.b, z1.b[3]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "sdot z26.s, z14.b, z2.b[3]\n"
                         "sdot z30.s, z14.b, z3.b[3]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3781,17 +3804,17 @@
                         "sdot z17.s, z9.b, z4.b[0]\n"
                         "sdot z21.s, z9.b, z5.b[0]\n"
                         "sdot z25.s, z9.b, z6.b[0]\n"
-                        "sdot z29.s, z9.b, z7.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z29.s, z9.b, z7.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z10.b, z4.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "sdot z22.s, z10.b, z5.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "sdot z26.s, z10.b, z6.b[0]\n"
                         "sdot z30.s, z10.b, z7.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3813,13 +3836,13 @@
                         "sdot z17.s, z13.b, z4.b[1]\n"
                         "sdot z21.s, z13.b, z5.b[1]\n"
                         "sdot z25.s, z13.b, z6.b[1]\n"
-                        "sdot z29.s, z13.b, z7.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z29.s, z13.b, z7.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z14.b, z4.b[1]\n"
                         "sdot z22.s, z14.b, z5.b[1]\n"
                         "sdot z26.s, z14.b, z6.b[1]\n"
@@ -3841,11 +3864,11 @@
                         "sdot z17.s, z9.b, z4.b[2]\n"
                         "sdot z21.s, z9.b, z5.b[2]\n"
                         "sdot z25.s, z9.b, z6.b[2]\n"
-                        "sdot z29.s, z9.b, z7.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z29.s, z9.b, z7.b[2]\n"
                         "sdot z18.s, z10.b, z4.b[2]\n"
                         "sdot z22.s, z10.b, z5.b[2]\n"
                         "sdot z26.s, z10.b, z6.b[2]\n"
@@ -3870,15 +3893,15 @@
                         "sdot z23.s, z15.b, z5.b[3]\n"
                         "sdot z27.s, z15.b, z6.b[3]\n"
                         "sdot z31.s, z15.b, z7.b[3]\n"
-                        "cbz %[blocks], 6f\n"
+                        "cbz %[blocks], 5f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -3904,15 +3927,15 @@
                         "sdot z23.s, z11.b, z1.b[0]\n"
                         "sdot z27.s, z11.b, z2.b[0]\n"
                         "sdot z31.s, z11.b, z3.b[0]\n"
-                        "b.eq 7f\n"
+                        "b.eq 6f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -3938,13 +3961,13 @@
                         "sdot z23.s, z15.b, z1.b[1]\n"
                         "sdot z27.s, z15.b, z2.b[1]\n"
                         "sdot z31.s, z15.b, z3.b[1]\n"
-                        "b.eq 8f\n"
+                        "b.eq 7f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -3971,31 +3994,31 @@
                         "sdot z23.s, z11.b, z1.b[2]\n"
                         "sdot z27.s, z11.b, z2.b[2]\n"
                         "sdot z31.s, z11.b, z3.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 9f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 10f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 11f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "11:\n"
+                        "b 11f\n"
+                        "10:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "10:\n"
+                        "b 11f\n"
+                        "9:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "12:\n"
+                        "11:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -4021,33 +4044,33 @@
                         "sdot z23.s, z15.b, z1.b[3]\n"
                         "sdot z27.s, z15.b, z2.b[3]\n"
                         "sdot z31.s, z15.b, z3.b[3]\n"
-                        "b 9f\n"
-                        "8:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "7:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 12f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 13f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 14f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "14:\n"
+                        "b 14f\n"
+                        "13:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "13:\n"
+                        "b 14f\n"
+                        "12:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "15:\n"
+                        "14:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -4073,33 +4096,33 @@
                         "sdot z23.s, z11.b, z1.b[2]\n"
                         "sdot z27.s, z11.b, z2.b[2]\n"
                         "sdot z31.s, z11.b, z3.b[2]\n"
-                        "b 9f\n"
-                        "7:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "6:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 15f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 16f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 17f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "17:\n"
+                        "b 17f\n"
+                        "16:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "16:\n"
+                        "b 17f\n"
+                        "15:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "18:\n"
+                        "17:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -4125,33 +4148,33 @@
                         "sdot z23.s, z15.b, z1.b[1]\n"
                         "sdot z27.s, z15.b, z2.b[1]\n"
                         "sdot z31.s, z15.b, z3.b[1]\n"
-                        "b 9f\n"
-                        "6:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "5:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 18f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 19f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 20f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "20:\n"
+                        "b 20f\n"
+                        "19:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "19:\n"
+                        "b 20f\n"
+                        "18:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "21:\n"
+                        "20:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -4177,8 +4200,8 @@
                         "sdot z23.s, z11.b, z1.b[0]\n"
                         "sdot z27.s, z11.b, z2.b[0]\n"
                         "sdot z31.s, z11.b, z3.b[0]\n"
-                        "b 9f\n"
-                        "5:\n"
+                        "b 8f\n"
+                        "4:\n"
                         "sdot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "sdot z20.s, z8.b, z1.b[0]\n"
@@ -4187,27 +4210,27 @@
                         "ld1rqb z5.b, p6/z, [a_ptr1]\n"
                         "sdot z28.s, z8.b, z3.b[0]\n"
                         "ld1rqb z6.b, p6/z, [a_ptr2]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z7.b, p6/z, [a_ptr3]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1rqb z7.b, p6/z, [a_ptr3]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z17.s, z9.b, z0.b[0]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z25.s, z9.b, z2.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "sdot z25.s, z9.b, z2.b[0]\n"
                         "sdot z29.s, z9.b, z3.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "sdot z22.s, z10.b, z1.b[0]\n"
                         "sdot z26.s, z10.b, z2.b[0]\n"
                         "sdot z30.s, z10.b, z3.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -4229,13 +4252,13 @@
                         "sdot z17.s, z13.b, z0.b[1]\n"
                         "sdot z21.s, z13.b, z1.b[1]\n"
                         "sdot z25.s, z13.b, z2.b[1]\n"
-                        "sdot z29.s, z13.b, z3.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "sdot z29.s, z13.b, z3.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "sdot z18.s, z14.b, z0.b[1]\n"
                         "sdot z22.s, z14.b, z1.b[1]\n"
                         "sdot z26.s, z14.b, z2.b[1]\n"
@@ -4257,11 +4280,11 @@
                         "sdot z17.s, z9.b, z0.b[2]\n"
                         "sdot z21.s, z9.b, z1.b[2]\n"
                         "sdot z25.s, z9.b, z2.b[2]\n"
-                        "sdot z29.s, z9.b, z3.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
+                        "sdot z29.s, z9.b, z3.b[2]\n"
                         "sdot z18.s, z10.b, z0.b[2]\n"
                         "sdot z22.s, z10.b, z1.b[2]\n"
                         "sdot z26.s, z10.b, z2.b[2]\n"
@@ -4286,15 +4309,15 @@
                         "sdot z23.s, z15.b, z1.b[3]\n"
                         "sdot z27.s, z15.b, z2.b[3]\n"
                         "sdot z31.s, z15.b, z3.b[3]\n"
-                        "cbz %[blocks], 22f\n"
+                        "cbz %[blocks], 21f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -4320,15 +4343,15 @@
                         "sdot z23.s, z11.b, z5.b[0]\n"
                         "sdot z27.s, z11.b, z6.b[0]\n"
                         "sdot z31.s, z11.b, z7.b[0]\n"
-                        "b.eq 23f\n"
+                        "b.eq 22f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -4354,13 +4377,13 @@
                         "sdot z23.s, z15.b, z5.b[1]\n"
                         "sdot z27.s, z15.b, z6.b[1]\n"
                         "sdot z31.s, z15.b, z7.b[1]\n"
-                        "b.eq 24f\n"
+                        "b.eq 23f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -4387,31 +4410,31 @@
                         "sdot z23.s, z11.b, z5.b[2]\n"
                         "sdot z27.s, z11.b, z6.b[2]\n"
                         "sdot z31.s, z11.b, z7.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 24f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 25f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 26f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "26:\n"
+                        "b 26f\n"
+                        "25:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "25:\n"
+                        "b 26f\n"
+                        "24:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "27:\n"
+                        "26:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -4437,33 +4460,33 @@
                         "sdot z23.s, z15.b, z5.b[3]\n"
                         "sdot z27.s, z15.b, z6.b[3]\n"
                         "sdot z31.s, z15.b, z7.b[3]\n"
-                        "b 9f\n"
-                        "24:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "23:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 27f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 28f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 29f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "29:\n"
+                        "b 29f\n"
+                        "28:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "28:\n"
+                        "b 29f\n"
+                        "27:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "30:\n"
+                        "29:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -4489,33 +4512,33 @@
                         "sdot z23.s, z11.b, z5.b[2]\n"
                         "sdot z27.s, z11.b, z6.b[2]\n"
                         "sdot z31.s, z11.b, z7.b[2]\n"
-                        "b 9f\n"
-                        "23:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "22:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 30f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 31f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 32f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "32:\n"
+                        "b 32f\n"
+                        "31:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "31:\n"
+                        "b 32f\n"
+                        "30:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "33:\n"
+                        "32:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -4541,33 +4564,33 @@
                         "sdot z23.s, z15.b, z5.b[1]\n"
                         "sdot z27.s, z15.b, z6.b[1]\n"
                         "sdot z31.s, z15.b, z7.b[1]\n"
-                        "b 9f\n"
-                        "22:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "21:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 33f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 34f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 35f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "35:\n"
+                        "b 35f\n"
+                        "34:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "34:\n"
+                        "b 35f\n"
+                        "33:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "36:\n"
+                        "35:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -4593,7 +4616,7 @@
                         "sdot z23.s, z11.b, z5.b[0]\n"
                         "sdot z27.s, z11.b, z6.b[0]\n"
                         "sdot z31.s, z11.b, z7.b[0]\n"
-                        "9:\n"
+                        "8:\n"
                         "st1w z16.s, p0, [%[c_ptr0]]\n"
                         "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
                         "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
@@ -4618,7 +4641,7 @@
                         ".unreq c_ptr2\n"
                         ".unreq c_ptr3\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
                     );
                     break;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp
index 7d89948..cdcea59 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -86,63 +86,73 @@
                         "mov z19.s, #0\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "b 2f\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "zip2 z11.b, z8.b, z9.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z9.b, z8.b, z9.b\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
                         "1:\n"
                         "ld1rw z15.s, p7/z, [%[betaptr]]\n"
                         "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
                         "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
                         "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
                         "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                         "mul z16.s, p7/m, z16.s, z15.s\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                         "mul z17.s, p7/m, z17.s, z15.s\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "mul z18.s, p7/m, z18.s, z15.s\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "mul z19.s, p7/m, z19.s, z15.s\n"
-                        "2:\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "cbz %[loops], 3f\n"
-                        "4:\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
                         "zip1 z10.b, z10.b, z8.b\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
                         "udot z16.s, z8.b, z0.b[0]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                         "zip2 z13.b, z13.b, z14.b\n"
-                        "subs %[loops], %[loops], #0x1\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
@@ -150,137 +160,137 @@
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z17.s, z13.b, z0.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z15.b, z0.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z16.s, z8.b, z0.b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z17.s, z9.b, z0.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z11.b, z0.b[2]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z16.s, z12.b, z0.b[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z17.s, z13.b, z0.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z15.b, z0.b[3]\n"
                         "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z16.s, z8.b, z4.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z17.s, z9.b, z4.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "udot z18.s, z10.b, z4.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "udot z19.s, z11.b, z4.b[0]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z16.s, z12.b, z4.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z17.s, z13.b, z4.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z15.b, z4.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z16.s, z8.b, z4.b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z17.s, z9.b, z4.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z11.b, z4.b[2]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z16.s, z12.b, z4.b[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z17.s, z13.b, z4.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z4.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z15.b, z4.b[3]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "b.ne 4b\n"
-                        "3:\n"
                         "zip2 z12.b, z10.b, z8.b\n"
+                        "b.ne 3b\n"
+                        "2:\n"
                         "zip1 z10.b, z10.b, z8.b\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
@@ -289,118 +299,118 @@
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
-                        "cbz %[regs], 5f\n"
+                        "cbz %[regs], 4f\n"
                         "udot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "udot z17.s, z9.b, z0.b[0]\n"
                         "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
                         "udot z18.s, z10.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z14.b, z15.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z16.s, z12.b, z0.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z17.s, z13.b, z0.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z15.b, z0.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z16.s, z8.b, z0.b[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z17.s, z9.b, z0.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z11.b, z0.b[2]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z16.s, z12.b, z0.b[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z17.s, z13.b, z0.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z15.b, z0.b[3]\n"
                         "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z16.s, z8.b, z4.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z17.s, z9.b, z4.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z11.b, z4.b[0]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z16.s, z12.b, z4.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z17.s, z13.b, z4.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z15.b, z4.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
@@ -427,15 +437,15 @@
                         "udot z17.s, z13.b, z4.b[3]\n"
                         "udot z18.s, z14.b, z4.b[3]\n"
                         "udot z19.s, z15.b, z4.b[3]\n"
-                        "cbz %[blocks], 6f\n"
+                        "cbz %[blocks], 5f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -449,15 +459,15 @@
                         "udot z17.s, z9.b, z0.b[0]\n"
                         "udot z18.s, z10.b, z0.b[0]\n"
                         "udot z19.s, z11.b, z0.b[0]\n"
-                        "b.eq 7f\n"
+                        "b.eq 6f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -471,13 +481,13 @@
                         "udot z17.s, z13.b, z0.b[1]\n"
                         "udot z18.s, z14.b, z0.b[1]\n"
                         "udot z19.s, z15.b, z0.b[1]\n"
-                        "b.eq 8f\n"
+                        "b.eq 7f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -492,31 +502,31 @@
                         "udot z17.s, z9.b, z0.b[2]\n"
                         "udot z18.s, z10.b, z0.b[2]\n"
                         "udot z19.s, z11.b, z0.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 9f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 10f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 11f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "11:\n"
+                        "b 11f\n"
+                        "10:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "10:\n"
+                        "b 11f\n"
+                        "9:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "12:\n"
+                        "11:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -530,33 +540,33 @@
                         "udot z17.s, z13.b, z0.b[3]\n"
                         "udot z18.s, z14.b, z0.b[3]\n"
                         "udot z19.s, z15.b, z0.b[3]\n"
-                        "b 9f\n"
-                        "8:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "7:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 12f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 13f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 14f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "14:\n"
+                        "b 14f\n"
+                        "13:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "13:\n"
+                        "b 14f\n"
+                        "12:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "15:\n"
+                        "14:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -570,33 +580,33 @@
                         "udot z17.s, z9.b, z0.b[2]\n"
                         "udot z18.s, z10.b, z0.b[2]\n"
                         "udot z19.s, z11.b, z0.b[2]\n"
-                        "b 9f\n"
-                        "7:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "6:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 15f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 16f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 17f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "17:\n"
+                        "b 17f\n"
+                        "16:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "16:\n"
+                        "b 17f\n"
+                        "15:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "18:\n"
+                        "17:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -610,33 +620,33 @@
                         "udot z17.s, z13.b, z0.b[1]\n"
                         "udot z18.s, z14.b, z0.b[1]\n"
                         "udot z19.s, z15.b, z0.b[1]\n"
-                        "b 9f\n"
-                        "6:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "5:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 18f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 19f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 20f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "20:\n"
+                        "b 20f\n"
+                        "19:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "19:\n"
+                        "b 20f\n"
+                        "18:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "21:\n"
+                        "20:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -650,38 +660,38 @@
                         "udot z17.s, z9.b, z0.b[0]\n"
                         "udot z18.s, z10.b, z0.b[0]\n"
                         "udot z19.s, z11.b, z0.b[0]\n"
-                        "b 9f\n"
-                        "5:\n"
+                        "b 8f\n"
+                        "4:\n"
                         "udot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "udot z17.s, z9.b, z0.b[0]\n"
                         "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
                         "udot z18.s, z10.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z19.s, z11.b, z0.b[0]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z14.b, z15.b, z8.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z16.s, z12.b, z0.b[1]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z17.s, z13.b, z0.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z15.b, z0.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
@@ -708,15 +718,15 @@
                         "udot z17.s, z13.b, z0.b[3]\n"
                         "udot z18.s, z14.b, z0.b[3]\n"
                         "udot z19.s, z15.b, z0.b[3]\n"
-                        "cbz %[blocks], 22f\n"
+                        "cbz %[blocks], 21f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -730,15 +740,15 @@
                         "udot z17.s, z9.b, z4.b[0]\n"
                         "udot z18.s, z10.b, z4.b[0]\n"
                         "udot z19.s, z11.b, z4.b[0]\n"
-                        "b.eq 23f\n"
+                        "b.eq 22f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -752,13 +762,13 @@
                         "udot z17.s, z13.b, z4.b[1]\n"
                         "udot z18.s, z14.b, z4.b[1]\n"
                         "udot z19.s, z15.b, z4.b[1]\n"
-                        "b.eq 24f\n"
+                        "b.eq 23f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -773,31 +783,31 @@
                         "udot z17.s, z9.b, z4.b[2]\n"
                         "udot z18.s, z10.b, z4.b[2]\n"
                         "udot z19.s, z11.b, z4.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 24f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 25f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 26f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "26:\n"
+                        "b 26f\n"
+                        "25:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "25:\n"
+                        "b 26f\n"
+                        "24:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "27:\n"
+                        "26:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -811,33 +821,33 @@
                         "udot z17.s, z13.b, z4.b[3]\n"
                         "udot z18.s, z14.b, z4.b[3]\n"
                         "udot z19.s, z15.b, z4.b[3]\n"
-                        "b 9f\n"
-                        "24:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "23:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 27f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 28f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 29f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "29:\n"
+                        "b 29f\n"
+                        "28:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "28:\n"
+                        "b 29f\n"
+                        "27:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "30:\n"
+                        "29:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -851,33 +861,33 @@
                         "udot z17.s, z9.b, z4.b[2]\n"
                         "udot z18.s, z10.b, z4.b[2]\n"
                         "udot z19.s, z11.b, z4.b[2]\n"
-                        "b 9f\n"
-                        "23:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "22:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 30f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 31f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 32f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "32:\n"
+                        "b 32f\n"
+                        "31:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "31:\n"
+                        "b 32f\n"
+                        "30:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "33:\n"
+                        "32:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -891,33 +901,33 @@
                         "udot z17.s, z13.b, z4.b[1]\n"
                         "udot z18.s, z14.b, z4.b[1]\n"
                         "udot z19.s, z15.b, z4.b[1]\n"
-                        "b 9f\n"
-                        "22:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "21:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 33f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 34f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 35f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "35:\n"
+                        "b 35f\n"
+                        "34:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "34:\n"
+                        "b 35f\n"
+                        "33:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "36:\n"
+                        "35:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -931,14 +941,14 @@
                         "udot z17.s, z9.b, z4.b[0]\n"
                         "udot z18.s, z10.b, z4.b[0]\n"
                         "udot z19.s, z11.b, z4.b[0]\n"
-                        "9:\n"
+                        "8:\n"
                         "st1w z16.s, p0, [%[c_ptr0]]\n"
                         "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
                         "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
                         "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
                         "addvl %[c_ptr0], %[c_ptr0], #4\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
                     );
                     break;
@@ -971,103 +981,108 @@
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "mov z21.s, #0\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "mov z22.s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
                         "zip2 z11.b, z8.b, z9.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "mov z22.s, #0\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z23.s, #0\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 2f\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
                         "1:\n"
                         "ld1rw z15.s, p7/z, [%[betaptr]]\n"
                         "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
                         "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
                         "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
                         "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1w z20.s, p0/z, [c_ptr1]\n"
                         "mul z16.s, p7/m, z16.s, z15.s\n"
-                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
                         "mul z17.s, p7/m, z17.s, z15.s\n"
-                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
                         "mul z18.s, p7/m, z18.s, z15.s\n"
-                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
                         "mul z19.s, p7/m, z19.s, z15.s\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
                         "mul z20.s, p7/m, z20.s, z15.s\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                         "mul z21.s, p7/m, z21.s, z15.s\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                         "mul z22.s, p7/m, z22.s, z15.s\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "mul z23.s, p7/m, z23.s, z15.s\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
                         "add a_ptr1, a_ptr1, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
-                        "2:\n"
-                        "cbz %[loops], 3f\n"
-                        "4:\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
                         "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "udot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "subs %[loops], %[loops], #0x1\n"
                         "udot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z10.b, z0.b[0]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip2 z13.b, z13.b, z14.b\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                         "zip1 z14.b, z15.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z11.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z23.s, z11.b, z1.b[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
                         "udot z16.s, z12.b, z0.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "udot z20.s, z12.b, z1.b[1]\n"
-                        "subs %[loops], %[loops], #0x1\n"
                         "zip2 z12.b, z10.b, z8.b\n"
                         "zip1 z10.b, z10.b, z8.b\n"
                         "udot z17.s, z13.b, z0.b[1]\n"
@@ -1092,148 +1107,148 @@
                         "udot z16.s, z8.b, z0.b[2]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "udot z20.s, z8.b, z1.b[2]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
                         "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
                         "udot z21.s, z9.b, z1.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z10.b, z0.b[2]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z11.b, z0.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z23.s, z11.b, z1.b[2]\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
                         "udot z17.s, z13.b, z0.b[3]\n"
                         "udot z21.s, z13.b, z1.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
                         "udot z18.s, z14.b, z0.b[3]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "udot z22.s, z14.b, z1.b[3]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z15.b, z0.b[3]\n"
                         "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
                         "udot z23.s, z15.b, z1.b[3]\n"
                         "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z20.s, z8.b, z5.b[0]\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
                         "udot z21.s, z9.b, z5.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z10.b, z4.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z22.s, z10.b, z5.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "udot z23.s, z11.b, z5.b[0]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "udot z20.s, z12.b, z5.b[1]\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
                         "udot z21.s, z13.b, z5.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z14.b, z4.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z22.s, z14.b, z5.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "udot z23.s, z15.b, z5.b[1]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "udot z20.s, z8.b, z5.b[2]\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
                         "udot z21.s, z9.b, z5.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z10.b, z4.b[2]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z22.s, z10.b, z5.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "udot z23.s, z11.b, z5.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z16.s, z12.b, z4.b[3]\n"
-                        "udot z20.s, z12.b, z5.b[3]\n"
-                        "udot z17.s, z13.b, z4.b[3]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z12.b, z4.b[3]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z5.b[3]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "udot z17.s, z13.b, z4.b[3]\n"
                         "udot z21.s, z13.b, z5.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z14.b, z4.b[3]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z22.s, z14.b, z5.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "udot z19.s, z15.b, z4.b[3]\n"
-                        "udot z23.s, z15.b, z5.b[3]\n"
-                        "b.ne 4b\n"
-                        "3:\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z14.b, z5.b[3]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z15.b, z4.b[3]\n"
+                        "udot z23.s, z15.b, z5.b[3]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
-                        "cbz %[regs], 5f\n"
+                        "cbz %[regs], 4f\n"
                         "udot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "udot z20.s, z8.b, z1.b[0]\n"
@@ -1245,13 +1260,13 @@
                         "zip2 z8.b, z14.b, z12.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z14.b, z15.b, z8.b\n"
@@ -1261,142 +1276,142 @@
                         "udot z19.s, z11.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z23.s, z11.b, z1.b[0]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
                         "udot z17.s, z13.b, z0.b[1]\n"
                         "udot z21.s, z13.b, z1.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
                         "udot z18.s, z14.b, z0.b[1]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
                         "udot z21.s, z9.b, z1.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z10.b, z0.b[2]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "udot z20.s, z12.b, z1.b[3]\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z12.b, z0.b[3]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z1.b[3]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "udot z17.s, z13.b, z0.b[3]\n"
                         "udot z21.s, z13.b, z1.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z14.b, z0.b[3]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "udot z22.s, z14.b, z1.b[3]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "udot z19.s, z15.b, z0.b[3]\n"
                         "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
                         "udot z23.s, z15.b, z1.b[3]\n"
                         "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "udot z20.s, z8.b, z5.b[0]\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z8.b, z4.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z20.s, z8.b, z5.b[0]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z17.s, z9.b, z4.b[0]\n"
                         "udot z21.s, z9.b, z5.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z10.b, z4.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z22.s, z10.b, z5.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "udot z23.s, z11.b, z5.b[0]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z11.b, z4.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "udot z20.s, z12.b, z5.b[1]\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
+                        "udot z23.s, z11.b, z5.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z12.b, z4.b[1]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z5.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
+                        "udot z17.s, z13.b, z4.b[1]\n"
                         "udot z21.s, z13.b, z5.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z14.b, z4.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z22.s, z14.b, z5.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "udot z23.s, z15.b, z5.b[1]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "udot z20.s, z8.b, z5.b[2]\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z15.b, z4.b[1]\n"
+                        "udot z23.s, z15.b, z5.b[1]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z8.b, z4.b[2]\n"
+                        "udot z20.s, z8.b, z5.b[2]\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
                         "udot z21.s, z9.b, z5.b[2]\n"
                         "udot z18.s, z10.b, z4.b[2]\n"
                         "udot z22.s, z10.b, z5.b[2]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "udot z23.s, z11.b, z5.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z19.s, z11.b, z4.b[2]\n"
+                        "udot z23.s, z11.b, z5.b[2]\n"
                         "udot z16.s, z12.b, z4.b[3]\n"
                         "udot z20.s, z12.b, z5.b[3]\n"
                         "udot z17.s, z13.b, z4.b[3]\n"
@@ -1405,15 +1420,15 @@
                         "udot z22.s, z14.b, z5.b[3]\n"
                         "udot z19.s, z15.b, z4.b[3]\n"
                         "udot z23.s, z15.b, z5.b[3]\n"
-                        "cbz %[blocks], 6f\n"
+                        "cbz %[blocks], 5f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -1431,15 +1446,15 @@
                         "udot z22.s, z10.b, z1.b[0]\n"
                         "udot z19.s, z11.b, z0.b[0]\n"
                         "udot z23.s, z11.b, z1.b[0]\n"
-                        "b.eq 7f\n"
+                        "b.eq 6f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -1457,13 +1472,13 @@
                         "udot z22.s, z14.b, z1.b[1]\n"
                         "udot z19.s, z15.b, z0.b[1]\n"
                         "udot z23.s, z15.b, z1.b[1]\n"
-                        "b.eq 8f\n"
+                        "b.eq 7f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -1482,31 +1497,31 @@
                         "udot z22.s, z10.b, z1.b[2]\n"
                         "udot z19.s, z11.b, z0.b[2]\n"
                         "udot z23.s, z11.b, z1.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 9f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 10f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 11f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "11:\n"
+                        "b 11f\n"
+                        "10:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "10:\n"
+                        "b 11f\n"
+                        "9:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "12:\n"
+                        "11:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -1524,33 +1539,33 @@
                         "udot z22.s, z14.b, z1.b[3]\n"
                         "udot z19.s, z15.b, z0.b[3]\n"
                         "udot z23.s, z15.b, z1.b[3]\n"
-                        "b 9f\n"
-                        "8:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "7:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 12f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 13f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 14f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "14:\n"
+                        "b 14f\n"
+                        "13:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "13:\n"
+                        "b 14f\n"
+                        "12:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "15:\n"
+                        "14:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -1568,33 +1583,33 @@
                         "udot z22.s, z10.b, z1.b[2]\n"
                         "udot z19.s, z11.b, z0.b[2]\n"
                         "udot z23.s, z11.b, z1.b[2]\n"
-                        "b 9f\n"
-                        "7:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "6:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 15f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 16f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 17f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "17:\n"
+                        "b 17f\n"
+                        "16:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "16:\n"
+                        "b 17f\n"
+                        "15:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "18:\n"
+                        "17:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -1612,33 +1627,33 @@
                         "udot z22.s, z14.b, z1.b[1]\n"
                         "udot z19.s, z15.b, z0.b[1]\n"
                         "udot z23.s, z15.b, z1.b[1]\n"
-                        "b 9f\n"
-                        "6:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "5:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 18f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 19f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 20f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "20:\n"
+                        "b 20f\n"
+                        "19:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "19:\n"
+                        "b 20f\n"
+                        "18:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "21:\n"
+                        "20:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -1656,8 +1671,8 @@
                         "udot z22.s, z10.b, z1.b[0]\n"
                         "udot z19.s, z11.b, z0.b[0]\n"
                         "udot z23.s, z11.b, z1.b[0]\n"
-                        "b 9f\n"
-                        "5:\n"
+                        "b 8f\n"
+                        "4:\n"
                         "udot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "udot z20.s, z8.b, z1.b[0]\n"
@@ -1669,13 +1684,13 @@
                         "zip2 z8.b, z14.b, z12.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z14.b, z15.b, z8.b\n"
@@ -1685,44 +1700,44 @@
                         "udot z19.s, z11.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z23.s, z11.b, z1.b[0]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "udot z16.s, z12.b, z0.b[1]\n"
                         "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "udot z20.s, z12.b, z1.b[1]\n"
+                        "zip2 z12.b, z10.b, z8.b\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
                         "udot z17.s, z13.b, z0.b[1]\n"
                         "udot z21.s, z13.b, z1.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
                         "udot z18.s, z14.b, z0.b[1]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "udot z19.s, z15.b, z0.b[1]\n"
+                        "udot z23.s, z15.b, z1.b[1]\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
+                        "udot z16.s, z8.b, z0.b[2]\n"
+                        "udot z20.s, z8.b, z1.b[2]\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
+                        "zip2 z8.b, z14.b, z12.b\n"
+                        "zip1 z14.b, z14.b, z12.b\n"
                         "udot z21.s, z9.b, z1.b[2]\n"
                         "udot z18.s, z10.b, z0.b[2]\n"
                         "udot z22.s, z10.b, z1.b[2]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z19.s, z11.b, z0.b[2]\n"
+                        "udot z23.s, z11.b, z1.b[2]\n"
                         "udot z16.s, z12.b, z0.b[3]\n"
                         "udot z20.s, z12.b, z1.b[3]\n"
                         "udot z17.s, z13.b, z0.b[3]\n"
@@ -1731,15 +1746,15 @@
                         "udot z22.s, z14.b, z1.b[3]\n"
                         "udot z19.s, z15.b, z0.b[3]\n"
                         "udot z23.s, z15.b, z1.b[3]\n"
-                        "cbz %[blocks], 22f\n"
+                        "cbz %[blocks], 21f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -1757,15 +1772,15 @@
                         "udot z22.s, z10.b, z5.b[0]\n"
                         "udot z19.s, z11.b, z4.b[0]\n"
                         "udot z23.s, z11.b, z5.b[0]\n"
-                        "b.eq 23f\n"
+                        "b.eq 22f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -1783,13 +1798,13 @@
                         "udot z22.s, z14.b, z5.b[1]\n"
                         "udot z19.s, z15.b, z4.b[1]\n"
                         "udot z23.s, z15.b, z5.b[1]\n"
-                        "b.eq 24f\n"
+                        "b.eq 23f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -1808,31 +1823,31 @@
                         "udot z22.s, z10.b, z5.b[2]\n"
                         "udot z19.s, z11.b, z4.b[2]\n"
                         "udot z23.s, z11.b, z5.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 24f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 25f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 26f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "26:\n"
+                        "b 26f\n"
+                        "25:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "25:\n"
+                        "b 26f\n"
+                        "24:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "27:\n"
+                        "26:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -1850,33 +1865,33 @@
                         "udot z22.s, z14.b, z5.b[3]\n"
                         "udot z19.s, z15.b, z4.b[3]\n"
                         "udot z23.s, z15.b, z5.b[3]\n"
-                        "b 9f\n"
-                        "24:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "23:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 27f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 28f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 29f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "29:\n"
+                        "b 29f\n"
+                        "28:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "28:\n"
+                        "b 29f\n"
+                        "27:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "30:\n"
+                        "29:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -1894,33 +1909,33 @@
                         "udot z22.s, z10.b, z5.b[2]\n"
                         "udot z19.s, z11.b, z4.b[2]\n"
                         "udot z23.s, z11.b, z5.b[2]\n"
-                        "b 9f\n"
-                        "23:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "22:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 30f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 31f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 32f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "32:\n"
+                        "b 32f\n"
+                        "31:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "31:\n"
+                        "b 32f\n"
+                        "30:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "33:\n"
+                        "32:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -1938,33 +1953,33 @@
                         "udot z22.s, z14.b, z5.b[1]\n"
                         "udot z19.s, z15.b, z4.b[1]\n"
                         "udot z23.s, z15.b, z5.b[1]\n"
-                        "b 9f\n"
-                        "22:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "21:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 33f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 34f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 35f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "35:\n"
+                        "b 35f\n"
+                        "34:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "34:\n"
+                        "b 35f\n"
+                        "33:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "36:\n"
+                        "35:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -1982,7 +1997,7 @@
                         "udot z22.s, z10.b, z5.b[0]\n"
                         "udot z19.s, z11.b, z4.b[0]\n"
                         "udot z23.s, z11.b, z5.b[0]\n"
-                        "9:\n"
+                        "8:\n"
                         "st1w z16.s, p0, [%[c_ptr0]]\n"
                         "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
                         "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
@@ -1995,7 +2010,7 @@
                         ".unreq a_ptr1\n"
                         ".unreq c_ptr1\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
                     );
                     break;
@@ -2007,11 +2022,11 @@
                         "c_ptr2 .req X3\n"
                         "add a_ptr1, %[a_ptr0], %[lda]\n"
                         "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
                         "whilelt p6.b, %[temp], %[leftovers]\n"
                         "whilelt p0.s, %[temp], %[width]\n"
                         "whilelt p4.b, %[temp], %[width]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
                         "incw %[temp], all, mul #1\n"
                         "ptrue p7.b\n"
                         "whilelt p1.s, %[temp], %[width]\n"
@@ -2034,116 +2049,122 @@
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "mov z22.s, #0\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "mov z23.s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
                         "zip2 z11.b, z8.b, z9.b\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "mov z23.s, #0\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
                         "mov z24.s, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z25.s, #0\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "mov z26.s, #0\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "mov z25.s, #0\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "mov z26.s, #0\n"
+                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "zip2 z9.b, z9.b, z10.b\n"
+                        "zip1 z10.b, z11.b, z12.b\n"
+                        "zip2 z11.b, z11.b, z12.b\n"
+                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "mov z27.s, #0\n"
-                        "b 2f\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
                         "1:\n"
                         "ld1rw z15.s, p7/z, [%[betaptr]]\n"
                         "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
                         "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
                         "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
                         "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1w z20.s, p0/z, [c_ptr1]\n"
                         "mul z16.s, p7/m, z16.s, z15.s\n"
-                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
                         "mul z17.s, p7/m, z17.s, z15.s\n"
-                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
                         "mul z18.s, p7/m, z18.s, z15.s\n"
-                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
                         "mul z19.s, p7/m, z19.s, z15.s\n"
-                        "ld1w z24.s, p0/z, [c_ptr2]\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
                         "mul z20.s, p7/m, z20.s, z15.s\n"
-                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "ld1w z24.s, p0/z, [c_ptr2]\n"
                         "mul z21.s, p7/m, z21.s, z15.s\n"
-                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
                         "mul z22.s, p7/m, z22.s, z15.s\n"
-                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
                         "mul z23.s, p7/m, z23.s, z15.s\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
                         "mul z24.s, p7/m, z24.s, z15.s\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                         "mul z25.s, p7/m, z25.s, z15.s\n"
-                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                         "mul z26.s, p7/m, z26.s, z15.s\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
                         "mul z27.s, p7/m, z27.s, z15.s\n"
+                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "add a_ptr1, a_ptr1, #0x10\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
                         "zip2 z11.b, z8.b, z9.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
-                        "2:\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "cbz %[loops], 3f\n"
-                        "4:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
                         "udot z16.s, z8.b, z0.b[0]\n"
+                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "udot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "udot z24.s, z8.b, z2.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
                         "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z24.s, z8.b, z2.b[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z25.s, z9.b, z2.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "add %[a_ptr0], %[a_ptr0], #0x20\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "add a_ptr1, a_ptr1, #0x20\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
                         "udot z26.s, z10.b, z2.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z11.b, z0.b[0]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "udot z23.s, z11.b, z1.b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
                         "udot z27.s, z11.b, z2.b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -2157,15 +2178,15 @@
                         "udot z21.s, z13.b, z1.b[1]\n"
                         "udot z25.s, z13.b, z2.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
                         "udot z26.s, z14.b, z2.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z15.b, z0.b[1]\n"
@@ -2185,15 +2206,15 @@
                         "udot z21.s, z9.b, z1.b[2]\n"
                         "udot z25.s, z9.b, z2.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
                         "udot z26.s, z10.b, z2.b[2]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z11.b, z0.b[2]\n"
@@ -2213,15 +2234,15 @@
                         "udot z21.s, z13.b, z1.b[3]\n"
                         "udot z25.s, z13.b, z2.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
                         "udot z26.s, z14.b, z2.b[3]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z15.b, z0.b[3]\n"
@@ -2244,15 +2265,15 @@
                         "udot z21.s, z9.b, z5.b[0]\n"
                         "udot z25.s, z9.b, z6.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
                         "udot z26.s, z10.b, z6.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z11.b, z4.b[0]\n"
@@ -2272,15 +2293,15 @@
                         "udot z21.s, z13.b, z5.b[1]\n"
                         "udot z25.s, z13.b, z6.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z14.b, z5.b[1]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z14.b, z5.b[1]\n"
                         "udot z26.s, z14.b, z6.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z15.b, z4.b[1]\n"
@@ -2300,15 +2321,15 @@
                         "udot z21.s, z9.b, z5.b[2]\n"
                         "udot z25.s, z9.b, z6.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z10.b, z5.b[2]\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
                         "udot z26.s, z10.b, z6.b[2]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z11.b, z4.b[2]\n"
@@ -2328,23 +2349,23 @@
                         "udot z21.s, z13.b, z5.b[3]\n"
                         "udot z25.s, z13.b, z6.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z4.b[3]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z18.s, z14.b, z4.b[3]\n"
                         "udot z22.s, z14.b, z5.b[3]\n"
                         "udot z26.s, z14.b, z6.b[3]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z15.b, z4.b[3]\n"
                         "udot z23.s, z15.b, z5.b[3]\n"
                         "udot z27.s, z15.b, z6.b[3]\n"
-                        "b.ne 4b\n"
-                        "3:\n"
+                        "b.ne 3b\n"
+                        "2:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
-                        "cbz %[regs], 5f\n"
+                        "cbz %[regs], 4f\n"
                         "udot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "udot z20.s, z8.b, z1.b[0]\n"
@@ -2353,24 +2374,24 @@
                         "ld1rqb z5.b, p7/z, [a_ptr1]\n"
                         "udot z17.s, z9.b, z0.b[0]\n"
                         "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z25.s, z9.b, z2.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "udot z26.s, z10.b, z2.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z11.b, z0.b[0]\n"
@@ -2390,15 +2411,15 @@
                         "udot z21.s, z13.b, z1.b[1]\n"
                         "udot z25.s, z13.b, z2.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z14.b, z1.b[1]\n"
                         "udot z26.s, z14.b, z2.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z15.b, z0.b[1]\n"
@@ -2418,15 +2439,15 @@
                         "udot z21.s, z9.b, z1.b[2]\n"
                         "udot z25.s, z9.b, z2.b[2]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
                         "udot z26.s, z10.b, z2.b[2]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z11.b, z0.b[2]\n"
@@ -2446,15 +2467,15 @@
                         "udot z21.s, z13.b, z1.b[3]\n"
                         "udot z25.s, z13.b, z2.b[3]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z14.b, z1.b[3]\n"
+                        "udot z18.s, z14.b, z0.b[3]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z14.b, z1.b[3]\n"
                         "udot z26.s, z14.b, z2.b[3]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z15.b, z0.b[3]\n"
@@ -2477,15 +2498,15 @@
                         "udot z21.s, z9.b, z5.b[0]\n"
                         "udot z25.s, z9.b, z6.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z10.b, z5.b[0]\n"
+                        "udot z18.s, z10.b, z4.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z10.b, z5.b[0]\n"
                         "udot z26.s, z10.b, z6.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z11.b, z4.b[0]\n"
@@ -2505,12 +2526,12 @@
                         "udot z21.s, z13.b, z5.b[1]\n"
                         "udot z25.s, z13.b, z6.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z18.s, z14.b, z4.b[1]\n"
                         "udot z22.s, z14.b, z5.b[1]\n"
                         "udot z26.s, z14.b, z6.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -2523,17 +2544,17 @@
                         "udot z16.s, z8.b, z4.b[2]\n"
                         "udot z20.s, z8.b, z5.b[2]\n"
                         "udot z24.s, z8.b, z6.b[2]\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
                         "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z17.s, z9.b, z4.b[2]\n"
                         "udot z21.s, z9.b, z5.b[2]\n"
                         "udot z25.s, z9.b, z6.b[2]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "udot z22.s, z10.b, z5.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z18.s, z10.b, z4.b[2]\n"
+                        "udot z22.s, z10.b, z5.b[2]\n"
                         "udot z26.s, z10.b, z6.b[2]\n"
                         "udot z19.s, z11.b, z4.b[2]\n"
                         "udot z23.s, z11.b, z5.b[2]\n"
@@ -2550,15 +2571,15 @@
                         "udot z19.s, z15.b, z4.b[3]\n"
                         "udot z23.s, z15.b, z5.b[3]\n"
                         "udot z27.s, z15.b, z6.b[3]\n"
-                        "cbz %[blocks], 6f\n"
+                        "cbz %[blocks], 5f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -2580,15 +2601,15 @@
                         "udot z19.s, z11.b, z0.b[0]\n"
                         "udot z23.s, z11.b, z1.b[0]\n"
                         "udot z27.s, z11.b, z2.b[0]\n"
-                        "b.eq 7f\n"
+                        "b.eq 6f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -2610,13 +2631,13 @@
                         "udot z19.s, z15.b, z0.b[1]\n"
                         "udot z23.s, z15.b, z1.b[1]\n"
                         "udot z27.s, z15.b, z2.b[1]\n"
-                        "b.eq 8f\n"
+                        "b.eq 7f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -2639,31 +2660,31 @@
                         "udot z19.s, z11.b, z0.b[2]\n"
                         "udot z23.s, z11.b, z1.b[2]\n"
                         "udot z27.s, z11.b, z2.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 9f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 10f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 11f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "11:\n"
+                        "b 11f\n"
+                        "10:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "10:\n"
+                        "b 11f\n"
+                        "9:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "12:\n"
+                        "11:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -2685,33 +2706,33 @@
                         "udot z19.s, z15.b, z0.b[3]\n"
                         "udot z23.s, z15.b, z1.b[3]\n"
                         "udot z27.s, z15.b, z2.b[3]\n"
-                        "b 9f\n"
-                        "8:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "7:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 12f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 13f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 14f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "14:\n"
+                        "b 14f\n"
+                        "13:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "13:\n"
+                        "b 14f\n"
+                        "12:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "15:\n"
+                        "14:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -2733,33 +2754,33 @@
                         "udot z19.s, z11.b, z0.b[2]\n"
                         "udot z23.s, z11.b, z1.b[2]\n"
                         "udot z27.s, z11.b, z2.b[2]\n"
-                        "b 9f\n"
-                        "7:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "6:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 15f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 16f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 17f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "17:\n"
+                        "b 17f\n"
+                        "16:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "16:\n"
+                        "b 17f\n"
+                        "15:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "18:\n"
+                        "17:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -2781,33 +2802,33 @@
                         "udot z19.s, z15.b, z0.b[1]\n"
                         "udot z23.s, z15.b, z1.b[1]\n"
                         "udot z27.s, z15.b, z2.b[1]\n"
-                        "b 9f\n"
-                        "6:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "5:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 18f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 19f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 20f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "20:\n"
+                        "b 20f\n"
+                        "19:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "19:\n"
+                        "b 20f\n"
+                        "18:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "21:\n"
+                        "20:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -2829,8 +2850,8 @@
                         "udot z19.s, z11.b, z0.b[0]\n"
                         "udot z23.s, z11.b, z1.b[0]\n"
                         "udot z27.s, z11.b, z2.b[0]\n"
-                        "b 9f\n"
-                        "5:\n"
+                        "b 8f\n"
+                        "4:\n"
                         "udot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "udot z20.s, z8.b, z1.b[0]\n"
@@ -2839,24 +2860,24 @@
                         "ld1rqb z5.b, p6/z, [a_ptr1]\n"
                         "udot z17.s, z9.b, z0.b[0]\n"
                         "ld1rqb z6.b, p6/z, [a_ptr2]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z25.s, z9.b, z2.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
+                        "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "udot z18.s, z10.b, z0.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "udot z26.s, z10.b, z2.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z11.b, z0.b[0]\n"
@@ -2876,12 +2897,12 @@
                         "udot z21.s, z13.b, z1.b[1]\n"
                         "udot z25.s, z13.b, z2.b[1]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z18.s, z14.b, z0.b[1]\n"
                         "udot z22.s, z14.b, z1.b[1]\n"
                         "udot z26.s, z14.b, z2.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -2894,17 +2915,17 @@
                         "udot z16.s, z8.b, z0.b[2]\n"
                         "udot z20.s, z8.b, z1.b[2]\n"
                         "udot z24.s, z8.b, z2.b[2]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
                         "zip1 z14.b, z14.b, z12.b\n"
+                        "udot z17.s, z9.b, z0.b[2]\n"
                         "udot z21.s, z9.b, z1.b[2]\n"
                         "udot z25.s, z9.b, z2.b[2]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z18.s, z10.b, z0.b[2]\n"
+                        "udot z22.s, z10.b, z1.b[2]\n"
                         "udot z26.s, z10.b, z2.b[2]\n"
                         "udot z19.s, z11.b, z0.b[2]\n"
                         "udot z23.s, z11.b, z1.b[2]\n"
@@ -2921,15 +2942,15 @@
                         "udot z19.s, z15.b, z0.b[3]\n"
                         "udot z23.s, z15.b, z1.b[3]\n"
                         "udot z27.s, z15.b, z2.b[3]\n"
-                        "cbz %[blocks], 22f\n"
+                        "cbz %[blocks], 21f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -2951,15 +2972,15 @@
                         "udot z19.s, z11.b, z4.b[0]\n"
                         "udot z23.s, z11.b, z5.b[0]\n"
                         "udot z27.s, z11.b, z6.b[0]\n"
-                        "b.eq 23f\n"
+                        "b.eq 22f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -2981,13 +3002,13 @@
                         "udot z19.s, z15.b, z4.b[1]\n"
                         "udot z23.s, z15.b, z5.b[1]\n"
                         "udot z27.s, z15.b, z6.b[1]\n"
-                        "b.eq 24f\n"
+                        "b.eq 23f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -3010,31 +3031,31 @@
                         "udot z19.s, z11.b, z4.b[2]\n"
                         "udot z23.s, z11.b, z5.b[2]\n"
                         "udot z27.s, z11.b, z6.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 24f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 25f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 26f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "26:\n"
+                        "b 26f\n"
+                        "25:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "25:\n"
+                        "b 26f\n"
+                        "24:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "27:\n"
+                        "26:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -3056,33 +3077,33 @@
                         "udot z19.s, z15.b, z4.b[3]\n"
                         "udot z23.s, z15.b, z5.b[3]\n"
                         "udot z27.s, z15.b, z6.b[3]\n"
-                        "b 9f\n"
-                        "24:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "23:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 27f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 28f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 29f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "29:\n"
+                        "b 29f\n"
+                        "28:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "28:\n"
+                        "b 29f\n"
+                        "27:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "30:\n"
+                        "29:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -3104,33 +3125,33 @@
                         "udot z19.s, z11.b, z4.b[2]\n"
                         "udot z23.s, z11.b, z5.b[2]\n"
                         "udot z27.s, z11.b, z6.b[2]\n"
-                        "b 9f\n"
-                        "23:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "22:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 30f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 31f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 32f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "32:\n"
+                        "b 32f\n"
+                        "31:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "31:\n"
+                        "b 32f\n"
+                        "30:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "33:\n"
+                        "32:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -3152,33 +3173,33 @@
                         "udot z19.s, z15.b, z4.b[1]\n"
                         "udot z23.s, z15.b, z5.b[1]\n"
                         "udot z27.s, z15.b, z6.b[1]\n"
-                        "b 9f\n"
-                        "22:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "21:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 33f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 34f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 35f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "35:\n"
+                        "b 35f\n"
+                        "34:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "34:\n"
+                        "b 35f\n"
+                        "33:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "36:\n"
+                        "35:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -3200,7 +3221,7 @@
                         "udot z19.s, z11.b, z4.b[0]\n"
                         "udot z23.s, z11.b, z5.b[0]\n"
                         "udot z27.s, z11.b, z6.b[0]\n"
-                        "9:\n"
+                        "8:\n"
                         "st1w z16.s, p0, [%[c_ptr0]]\n"
                         "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
                         "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
@@ -3219,7 +3240,7 @@
                         ".unreq c_ptr1\n"
                         ".unreq c_ptr2\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
                     );
                     break;
@@ -3234,15 +3255,15 @@
                         "c_ptr3 .req X5\n"
                         "add a_ptr1, %[a_ptr0], %[lda]\n"
                         "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "add a_ptr3, a_ptr2, %[lda]\n"
+                        "add c_ptr3, c_ptr2, %[ldc]\n"
                         "whilelt p6.b, %[temp], %[leftovers]\n"
                         "whilelt p0.s, %[temp], %[width]\n"
                         "whilelt p4.b, %[temp], %[width]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
                         "incw %[temp], all, mul #1\n"
                         "ptrue p7.b\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
                         "whilelt p1.s, %[temp], %[width]\n"
                         "incw %[temp], all, mul #1\n"
                         "whilelt p2.s, %[temp], %[width]\n"
@@ -3265,77 +3286,80 @@
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "mov z23.s, #0\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "mov z24.s, #0\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
                         "zip2 z11.b, z8.b, z9.b\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
+                        "mov z24.s, #0\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
                         "mov z25.s, #0\n"
                         "add a_ptr3, a_ptr3, #0x10\n"
-                        "mov z26.s, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z10.b, z10.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "mov z27.s, #0\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "mov z28.s, #0\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "mov z26.s, #0\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "mov z27.s, #0\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "mov z28.s, #0\n"
                         "mov z29.s, #0\n"
                         "mov z30.s, #0\n"
+                        "zip2 z15.b, z12.b, z13.b\n"
+                        "zip1 z13.b, z12.b, z13.b\n"
                         "mov z31.s, #0\n"
-                        "b 2f\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
                         "1:\n"
                         "ld1rw z15.s, p7/z, [%[betaptr]]\n"
                         "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
                         "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
                         "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
                         "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
-                        "ld1w z20.s, p0/z, [c_ptr1]\n"
                         "mul z16.s, p7/m, z16.s, z15.s\n"
-                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "ld1w z20.s, p0/z, [c_ptr1]\n"
                         "mul z17.s, p7/m, z17.s, z15.s\n"
-                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
+                        "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
                         "mul z18.s, p7/m, z18.s, z15.s\n"
-                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
+                        "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
                         "mul z19.s, p7/m, z19.s, z15.s\n"
-                        "ld1w z24.s, p0/z, [c_ptr2]\n"
+                        "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
                         "mul z20.s, p7/m, z20.s, z15.s\n"
-                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "ld1w z24.s, p0/z, [c_ptr2]\n"
                         "mul z21.s, p7/m, z21.s, z15.s\n"
-                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
+                        "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
                         "mul z22.s, p7/m, z22.s, z15.s\n"
-                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
+                        "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
                         "mul z23.s, p7/m, z23.s, z15.s\n"
-                        "ld1w z28.s, p0/z, [c_ptr3]\n"
+                        "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
                         "mul z24.s, p7/m, z24.s, z15.s\n"
-                        "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+                        "ld1w z28.s, p0/z, [c_ptr3]\n"
                         "mul z25.s, p7/m, z25.s, z15.s\n"
-                        "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
+                        "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
                         "mul z26.s, p7/m, z26.s, z15.s\n"
-                        "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
+                        "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
                         "mul z27.s, p7/m, z27.s, z15.s\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+                        "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
                         "mul z28.s, p7/m, z28.s, z15.s\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                         "mul z29.s, p7/m, z29.s, z15.s\n"
-                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
                         "mul z30.s, p7/m, z30.s, z15.s\n"
-                        "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
                         "mul z31.s, p7/m, z31.s, z15.s\n"
+                        "ld1rqb z3.b, p7/z, [a_ptr3]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "add a_ptr1, a_ptr1, #0x10\n"
+                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "add a_ptr2, a_ptr2, #0x10\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "add a_ptr3, a_ptr3, #0x10\n"
@@ -3344,21 +3368,20 @@
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "zip1 z10.b, z10.b, z8.b\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "2:\n"
-                        "cbz %[loops], 3f\n"
-                        "4:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
                         "udot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "udot z20.s, z8.b, z1.b[0]\n"
@@ -3367,38 +3390,38 @@
                         "ld1rqb z5.b, p7/z, [a_ptr1]\n"
                         "udot z28.s, z8.b, z3.b[0]\n"
                         "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z25.s, z9.b, z2.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip1 z14.b, z15.b, z8.b\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                         "udot z29.s, z9.b, z3.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
                         "udot z26.s, z10.b, z2.b[0]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
                         "udot z30.s, z10.b, z3.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "udot z19.s, z11.b, z0.b[0]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "udot z23.s, z11.b, z1.b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
+                        "add a_ptr3, a_ptr3, #0x20\n"
                         "udot z27.s, z11.b, z2.b[0]\n"
                         "udot z31.s, z11.b, z3.b[0]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
@@ -3414,17 +3437,17 @@
                         "udot z17.s, z13.b, z0.b[1]\n"
                         "udot z21.s, z13.b, z1.b[1]\n"
                         "udot z25.s, z13.b, z2.b[1]\n"
-                        "udot z29.s, z13.b, z3.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z29.s, z13.b, z3.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z14.b, z0.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z22.s, z14.b, z1.b[1]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "udot z26.s, z14.b, z2.b[1]\n"
                         "udot z30.s, z14.b, z3.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3446,17 +3469,17 @@
                         "udot z17.s, z9.b, z0.b[2]\n"
                         "udot z21.s, z9.b, z1.b[2]\n"
                         "udot z25.s, z9.b, z2.b[2]\n"
-                        "udot z29.s, z9.b, z3.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z29.s, z9.b, z3.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z10.b, z0.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z22.s, z10.b, z1.b[2]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "udot z26.s, z10.b, z2.b[2]\n"
                         "udot z30.s, z10.b, z3.b[2]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3478,17 +3501,17 @@
                         "udot z17.s, z13.b, z0.b[3]\n"
                         "udot z21.s, z13.b, z1.b[3]\n"
                         "udot z25.s, z13.b, z2.b[3]\n"
-                        "udot z29.s, z13.b, z3.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z29.s, z13.b, z3.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z14.b, z0.b[3]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z22.s, z14.b, z1.b[3]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "udot z26.s, z14.b, z2.b[3]\n"
                         "udot z30.s, z14.b, z3.b[3]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3514,17 +3537,17 @@
                         "udot z17.s, z9.b, z4.b[0]\n"
                         "udot z21.s, z9.b, z5.b[0]\n"
                         "udot z25.s, z9.b, z6.b[0]\n"
-                        "udot z29.s, z9.b, z7.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z29.s, z9.b, z7.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z10.b, z4.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z22.s, z10.b, z5.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "udot z26.s, z10.b, z6.b[0]\n"
                         "udot z30.s, z10.b, z7.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3546,17 +3569,17 @@
                         "udot z17.s, z13.b, z4.b[1]\n"
                         "udot z21.s, z13.b, z5.b[1]\n"
                         "udot z25.s, z13.b, z6.b[1]\n"
-                        "udot z29.s, z13.b, z7.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z29.s, z13.b, z7.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z14.b, z4.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z22.s, z14.b, z5.b[1]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "udot z26.s, z14.b, z6.b[1]\n"
                         "udot z30.s, z14.b, z7.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3578,17 +3601,17 @@
                         "udot z17.s, z9.b, z4.b[2]\n"
                         "udot z21.s, z9.b, z5.b[2]\n"
                         "udot z25.s, z9.b, z6.b[2]\n"
-                        "udot z29.s, z9.b, z7.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z29.s, z9.b, z7.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z10.b, z4.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z22.s, z10.b, z5.b[2]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "udot z26.s, z10.b, z6.b[2]\n"
                         "udot z30.s, z10.b, z7.b[2]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3610,13 +3633,13 @@
                         "udot z17.s, z13.b, z4.b[3]\n"
                         "udot z21.s, z13.b, z5.b[3]\n"
                         "udot z25.s, z13.b, z6.b[3]\n"
-                        "udot z29.s, z13.b, z7.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z29.s, z13.b, z7.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z14.b, z4.b[3]\n"
                         "udot z22.s, z14.b, z5.b[3]\n"
                         "udot z26.s, z14.b, z6.b[3]\n"
@@ -3626,11 +3649,11 @@
                         "udot z23.s, z15.b, z5.b[3]\n"
                         "udot z27.s, z15.b, z6.b[3]\n"
                         "udot z31.s, z15.b, z7.b[3]\n"
-                        "b.ne 4b\n"
-                        "3:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
-                        "cbz %[regs], 5f\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
                         "udot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "udot z20.s, z8.b, z1.b[0]\n"
@@ -3639,27 +3662,27 @@
                         "ld1rqb z5.b, p7/z, [a_ptr1]\n"
                         "udot z28.s, z8.b, z3.b[0]\n"
                         "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z25.s, z9.b, z2.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
                         "udot z29.s, z9.b, z3.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
                         "udot z26.s, z10.b, z2.b[0]\n"
                         "udot z30.s, z10.b, z3.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3681,17 +3704,17 @@
                         "udot z17.s, z13.b, z0.b[1]\n"
                         "udot z21.s, z13.b, z1.b[1]\n"
                         "udot z25.s, z13.b, z2.b[1]\n"
-                        "udot z29.s, z13.b, z3.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z29.s, z13.b, z3.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z14.b, z0.b[1]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z22.s, z14.b, z1.b[1]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "udot z26.s, z14.b, z2.b[1]\n"
                         "udot z30.s, z14.b, z3.b[1]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3713,17 +3736,17 @@
                         "udot z17.s, z9.b, z0.b[2]\n"
                         "udot z21.s, z9.b, z1.b[2]\n"
                         "udot z25.s, z9.b, z2.b[2]\n"
-                        "udot z29.s, z9.b, z3.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z29.s, z9.b, z3.b[2]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z10.b, z0.b[2]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z22.s, z10.b, z1.b[2]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "udot z26.s, z10.b, z2.b[2]\n"
                         "udot z30.s, z10.b, z3.b[2]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3745,17 +3768,17 @@
                         "udot z17.s, z13.b, z0.b[3]\n"
                         "udot z21.s, z13.b, z1.b[3]\n"
                         "udot z25.s, z13.b, z2.b[3]\n"
-                        "udot z29.s, z13.b, z3.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z29.s, z13.b, z3.b[3]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z14.b, z0.b[3]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z22.s, z14.b, z1.b[3]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "udot z26.s, z14.b, z2.b[3]\n"
                         "udot z30.s, z14.b, z3.b[3]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
@@ -3781,17 +3804,17 @@
                         "udot z17.s, z9.b, z4.b[0]\n"
                         "udot z21.s, z9.b, z5.b[0]\n"
                         "udot z25.s, z9.b, z6.b[0]\n"
-                        "udot z29.s, z9.b, z7.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z29.s, z9.b, z7.b[0]\n"
+                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z10.b, z4.b[0]\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "udot z22.s, z10.b, z5.b[0]\n"
+                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
                         "udot z26.s, z10.b, z6.b[0]\n"
                         "udot z30.s, z10.b, z7.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -3813,13 +3836,13 @@
                         "udot z17.s, z13.b, z4.b[1]\n"
                         "udot z21.s, z13.b, z5.b[1]\n"
                         "udot z25.s, z13.b, z6.b[1]\n"
-                        "udot z29.s, z13.b, z7.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z29.s, z13.b, z7.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z14.b, z4.b[1]\n"
                         "udot z22.s, z14.b, z5.b[1]\n"
                         "udot z26.s, z14.b, z6.b[1]\n"
@@ -3841,11 +3864,11 @@
                         "udot z17.s, z9.b, z4.b[2]\n"
                         "udot z21.s, z9.b, z5.b[2]\n"
                         "udot z25.s, z9.b, z6.b[2]\n"
-                        "udot z29.s, z9.b, z7.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z29.s, z9.b, z7.b[2]\n"
                         "udot z18.s, z10.b, z4.b[2]\n"
                         "udot z22.s, z10.b, z5.b[2]\n"
                         "udot z26.s, z10.b, z6.b[2]\n"
@@ -3870,15 +3893,15 @@
                         "udot z23.s, z15.b, z5.b[3]\n"
                         "udot z27.s, z15.b, z6.b[3]\n"
                         "udot z31.s, z15.b, z7.b[3]\n"
-                        "cbz %[blocks], 6f\n"
+                        "cbz %[blocks], 5f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -3904,15 +3927,15 @@
                         "udot z23.s, z11.b, z1.b[0]\n"
                         "udot z27.s, z11.b, z2.b[0]\n"
                         "udot z31.s, z11.b, z3.b[0]\n"
-                        "b.eq 7f\n"
+                        "b.eq 6f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -3938,13 +3961,13 @@
                         "udot z23.s, z15.b, z1.b[1]\n"
                         "udot z27.s, z15.b, z2.b[1]\n"
                         "udot z31.s, z15.b, z3.b[1]\n"
-                        "b.eq 8f\n"
+                        "b.eq 7f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -3971,31 +3994,31 @@
                         "udot z23.s, z11.b, z1.b[2]\n"
                         "udot z27.s, z11.b, z2.b[2]\n"
                         "udot z31.s, z11.b, z3.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 9f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 10f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 11f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "11:\n"
+                        "b 11f\n"
+                        "10:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 12f\n"
-                        "10:\n"
+                        "b 11f\n"
+                        "9:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "12:\n"
+                        "11:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -4021,33 +4044,33 @@
                         "udot z23.s, z15.b, z1.b[3]\n"
                         "udot z27.s, z15.b, z2.b[3]\n"
                         "udot z31.s, z15.b, z3.b[3]\n"
-                        "b 9f\n"
-                        "8:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "7:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 12f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 13f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 14f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "14:\n"
+                        "b 14f\n"
+                        "13:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 15f\n"
-                        "13:\n"
+                        "b 14f\n"
+                        "12:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "15:\n"
+                        "14:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -4073,33 +4096,33 @@
                         "udot z23.s, z11.b, z1.b[2]\n"
                         "udot z27.s, z11.b, z2.b[2]\n"
                         "udot z31.s, z11.b, z3.b[2]\n"
-                        "b 9f\n"
-                        "7:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "6:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 15f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 16f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 17f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "17:\n"
+                        "b 17f\n"
+                        "16:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 18f\n"
-                        "16:\n"
+                        "b 17f\n"
+                        "15:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "18:\n"
+                        "17:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -4125,33 +4148,33 @@
                         "udot z23.s, z15.b, z1.b[1]\n"
                         "udot z27.s, z15.b, z2.b[1]\n"
                         "udot z31.s, z15.b, z3.b[1]\n"
-                        "b 9f\n"
-                        "6:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "5:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 18f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 19f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 20f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "20:\n"
+                        "b 20f\n"
+                        "19:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 21f\n"
-                        "19:\n"
+                        "b 20f\n"
+                        "18:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "21:\n"
+                        "20:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -4177,8 +4200,8 @@
                         "udot z23.s, z11.b, z1.b[0]\n"
                         "udot z27.s, z11.b, z2.b[0]\n"
                         "udot z31.s, z11.b, z3.b[0]\n"
-                        "b 9f\n"
-                        "5:\n"
+                        "b 8f\n"
+                        "4:\n"
                         "udot z16.s, z8.b, z0.b[0]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
                         "udot z20.s, z8.b, z1.b[0]\n"
@@ -4187,27 +4210,27 @@
                         "ld1rqb z5.b, p6/z, [a_ptr1]\n"
                         "udot z28.s, z8.b, z3.b[0]\n"
                         "ld1rqb z6.b, p6/z, [a_ptr2]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "ld1rqb z7.b, p6/z, [a_ptr3]\n"
                         "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "ld1rqb z7.b, p6/z, [a_ptr3]\n"
                         "zip1 z14.b, z14.b, z12.b\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z17.s, z9.b, z0.b[0]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z25.s, z9.b, z2.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "zip2 z13.b, z13.b, z14.b\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z21.s, z9.b, z1.b[0]\n"
+                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
+                        "udot z25.s, z9.b, z2.b[0]\n"
                         "udot z29.s, z9.b, z3.b[0]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
+                        "udot z22.s, z10.b, z1.b[0]\n"
                         "udot z26.s, z10.b, z2.b[0]\n"
                         "udot z30.s, z10.b, z3.b[0]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
@@ -4229,13 +4252,13 @@
                         "udot z17.s, z13.b, z0.b[1]\n"
                         "udot z21.s, z13.b, z1.b[1]\n"
                         "udot z25.s, z13.b, z2.b[1]\n"
-                        "udot z29.s, z13.b, z3.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "zip1 z8.b, z9.b, z10.b\n"
                         "zip2 z9.b, z9.b, z10.b\n"
                         "zip1 z10.b, z11.b, z12.b\n"
                         "zip2 z11.b, z11.b, z12.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "udot z29.s, z13.b, z3.b[1]\n"
+                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "udot z18.s, z14.b, z0.b[1]\n"
                         "udot z22.s, z14.b, z1.b[1]\n"
                         "udot z26.s, z14.b, z2.b[1]\n"
@@ -4257,11 +4280,11 @@
                         "udot z17.s, z9.b, z0.b[2]\n"
                         "udot z21.s, z9.b, z1.b[2]\n"
                         "udot z25.s, z9.b, z2.b[2]\n"
-                        "udot z29.s, z9.b, z3.b[2]\n"
                         "zip1 z12.b, z13.b, z14.b\n"
                         "zip2 z13.b, z13.b, z14.b\n"
                         "zip1 z14.b, z15.b, z8.b\n"
                         "zip2 z15.b, z15.b, z8.b\n"
+                        "udot z29.s, z9.b, z3.b[2]\n"
                         "udot z18.s, z10.b, z0.b[2]\n"
                         "udot z22.s, z10.b, z1.b[2]\n"
                         "udot z26.s, z10.b, z2.b[2]\n"
@@ -4286,15 +4309,15 @@
                         "udot z23.s, z15.b, z1.b[3]\n"
                         "udot z27.s, z15.b, z2.b[3]\n"
                         "udot z31.s, z15.b, z3.b[3]\n"
-                        "cbz %[blocks], 22f\n"
+                        "cbz %[blocks], 21f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
@@ -4320,15 +4343,15 @@
                         "udot z23.s, z11.b, z5.b[0]\n"
                         "udot z27.s, z11.b, z6.b[0]\n"
                         "udot z31.s, z11.b, z7.b[0]\n"
-                        "b.eq 23f\n"
+                        "b.eq 22f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
@@ -4354,13 +4377,13 @@
                         "udot z23.s, z15.b, z5.b[1]\n"
                         "udot z27.s, z15.b, z6.b[1]\n"
                         "udot z31.s, z15.b, z7.b[1]\n"
-                        "b.eq 24f\n"
+                        "b.eq 23f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
+                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
@@ -4387,31 +4410,31 @@
                         "udot z23.s, z11.b, z5.b[2]\n"
                         "udot z27.s, z11.b, z6.b[2]\n"
                         "udot z31.s, z11.b, z7.b[2]\n"
-                        "cbz %[odds], 9f\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 24f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 25f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 26f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "26:\n"
+                        "b 26f\n"
+                        "25:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 27f\n"
-                        "25:\n"
+                        "b 26f\n"
+                        "24:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "27:\n"
+                        "26:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -4437,33 +4460,33 @@
                         "udot z23.s, z15.b, z5.b[3]\n"
                         "udot z27.s, z15.b, z6.b[3]\n"
                         "udot z31.s, z15.b, z7.b[3]\n"
-                        "b 9f\n"
-                        "24:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "23:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 27f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 28f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 29f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "29:\n"
+                        "b 29f\n"
+                        "28:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 30f\n"
-                        "28:\n"
+                        "b 29f\n"
+                        "27:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "30:\n"
+                        "29:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -4489,33 +4512,33 @@
                         "udot z23.s, z11.b, z5.b[2]\n"
                         "udot z27.s, z11.b, z6.b[2]\n"
                         "udot z31.s, z11.b, z7.b[2]\n"
-                        "b 9f\n"
-                        "23:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "22:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 30f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 31f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 32f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "32:\n"
+                        "b 32f\n"
+                        "31:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 33f\n"
-                        "31:\n"
+                        "b 32f\n"
+                        "30:\n"
                         "mov z13.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z14.b, #0\n"
                         "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "33:\n"
+                        "32:\n"
                         "zip2 z15.b, z12.b, z13.b\n"
                         "zip1 z13.b, z12.b, z13.b\n"
                         "mov z12.b, #0\n"
@@ -4541,33 +4564,33 @@
                         "udot z23.s, z15.b, z5.b[1]\n"
                         "udot z27.s, z15.b, z6.b[1]\n"
                         "udot z31.s, z15.b, z7.b[1]\n"
-                        "b 9f\n"
-                        "22:\n"
-                        "cbz %[odds], 9f\n"
+                        "b 8f\n"
+                        "21:\n"
+                        "cbz %[odds], 8f\n"
+                        "subs %[odds], %[odds], #0x1\n"
+                        "b.eq 33f\n"
                         "subs %[odds], %[odds], #0x1\n"
                         "b.eq 34f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 35f\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
+                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "35:\n"
+                        "b 35f\n"
+                        "34:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
                         "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 36f\n"
-                        "34:\n"
+                        "b 35f\n"
+                        "33:\n"
                         "mov z9.b, #0\n"
                         "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
                         "mov z10.b, #0\n"
                         "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "36:\n"
+                        "35:\n"
                         "zip2 z11.b, z8.b, z9.b\n"
                         "zip1 z9.b, z8.b, z9.b\n"
                         "mov z8.b, #0\n"
@@ -4593,7 +4616,7 @@
                         "udot z23.s, z11.b, z5.b[0]\n"
                         "udot z27.s, z11.b, z6.b[0]\n"
                         "udot z31.s, z11.b, z7.b[0]\n"
-                        "9:\n"
+                        "8:\n"
                         "st1w z16.s, p0, [%[c_ptr0]]\n"
                         "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
                         "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
@@ -4618,7 +4641,7 @@
                         ".unreq c_ptr2\n"
                         ".unreq c_ptr3\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [leftovers] "r" (leftovers), [lda] "r" (ldab), [ldc] "r" (ldcb), [ldb] "r" (ldbb)
+                        : [betaptr] "r" (betaptr), [width] "r" (width), [beta0] "r" (beta0), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
                     );
                     break;
