COMPMID-1369: Revert accidental formatting of RSH's repo

Pulled the latest fixes from David's repo:

commit f43ebe932c84083332b0b1a0348241b69dda63a7
Author: David Mansell <David.Mansell@arm.com>
Date:   Tue Jul 3 18:09:01 2018 +0100

    Whitespace tidying, fixed comment in gemv_batched imported from ACL.

Change-Id: Ie37a623f44e90d88072236cb853ac55ac82d5f51
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/138530
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-by: David Mansell <david.mansell@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/src/core/NEON/kernels/arm_gemm/asmlib.hpp b/src/core/NEON/kernels/arm_gemm/asmlib.hpp
index b3fcb33..38f51ae 100644
--- a/src/core/NEON/kernels/arm_gemm/asmlib.hpp
+++ b/src/core/NEON/kernels/arm_gemm/asmlib.hpp
@@ -31,21 +31,21 @@
 // used by the workaround.
 
 // "Correct" version
-#define ASM_PREFETCH(address) "PRFM PLDL1KEEP, " address "\n"
-#define ASM_PREFETCHL2(address) "PRFM PLDL2KEEP, " address "\n"
-#define ASM_PREFETCHW(address) "PRFM PSTL1KEEP, " address "\n"
+#define ASM_PREFETCH(address)    "PRFM PLDL1KEEP, " address "\n"
+#define ASM_PREFETCHL2(address)  "PRFM PLDL2KEEP, " address "\n"
+#define ASM_PREFETCHW(address)   "PRFM PSTL1KEEP, " address "\n"
 #define ASM_PREFETCHWL2(address) "PRFM PSTL2KEEP, " address "\n"
 
 // Lee's uarchsim hack
-//#define ASM_PREFETCH(address)    "LDNP x20, x21, " address "\n"
+//#define ASM_PREFETCH(address)	"LDNP x20, x21, " address "\n"
 
 // No preload at all
 //#define ASM_PREFETCH(address) ""
 #else
 
 // "Correct" versions for AArch32
-#define ASM_PREFETCH(address) "PLD " address "\n"
-#define ASM_PREFETCHW(address) "PLDW " address "\n"
+#define ASM_PREFETCH(address)     "PLD " address "\n"
+#define ASM_PREFETCHW(address)    "PLDW " address "\n"
 
 #endif
 
@@ -53,76 +53,77 @@
  * Do some prefetches.
  */
 template <typename T>
-static inline void prefetch_6x(const T *pfp)
-{
-    __asm __volatile(
+static inline void prefetch_6x(const T *pfp) {
+    __asm __volatile (
         ASM_PREFETCH("[%[pfp]]")
         ASM_PREFETCH("[%[pfp], #64]")
         ASM_PREFETCH("[%[pfp], #128]")
         ASM_PREFETCH("[%[pfp], #192]")
         ASM_PREFETCH("[%[pfp], #256]")
         ASM_PREFETCH("[%[pfp], #320]")
-        :
-        : [pfp] "r"(pfp)
-        : "memory");
+    :
+    : [pfp] "r" (pfp)
+    : "memory"
+    );
 }
 
 template <typename T>
-static inline void prefetch_5x(const T *pfp)
-{
-    __asm __volatile(
+static inline void prefetch_5x(const T *pfp) {
+    __asm __volatile (
         ASM_PREFETCH("[%[pfp]]")
         ASM_PREFETCH("[%[pfp], #64]")
         ASM_PREFETCH("[%[pfp], #128]")
         ASM_PREFETCH("[%[pfp], #192]")
         ASM_PREFETCH("[%[pfp], #256]")
-        :
-        : [pfp] "r"(pfp)
-        : "memory");
+    :
+    : [pfp] "r" (pfp)
+    : "memory"
+    );
 }
 
 template <typename T>
-static inline void prefetch_4x(const T *pfp)
-{
-    __asm __volatile(
+static inline void prefetch_4x(const T *pfp) {
+    __asm __volatile (
         ASM_PREFETCH("[%[pfp]]")
         ASM_PREFETCH("[%[pfp], #64]")
         ASM_PREFETCH("[%[pfp], #128]")
         ASM_PREFETCH("[%[pfp], #192]")
-        :
-        : [pfp] "r"(pfp)
-        : "memory");
+    :
+    : [pfp] "r" (pfp)
+    : "memory"
+    );
 }
 
 template <typename T>
-static inline void prefetch_3x(const T *pfp)
-{
-    __asm __volatile(
+static inline void prefetch_3x(const T *pfp) {
+    __asm __volatile (
         ASM_PREFETCH("[%[pfp]]")
         ASM_PREFETCH("[%[pfp], #64]")
         ASM_PREFETCH("[%[pfp], #128]")
-        :
-        : [pfp] "r"(pfp)
-        : "memory");
+    :
+    : [pfp] "r" (pfp)
+    : "memory"
+    );
 }
 
 template <typename T>
-static inline void prefetch_2x(const T *pfp)
-{
-    __asm __volatile(
+static inline void prefetch_2x(const T *pfp) {
+    __asm __volatile (
         ASM_PREFETCH("[%[pfp]]")
         ASM_PREFETCH("[%[pfp], #64]")
-        :
-        : [pfp] "r"(pfp)
-        : "memory");
+    :
+    : [pfp] "r" (pfp)
+    : "memory"
+    );
 }
 
 template <typename T>
-static inline void prefetch_1x(const T *pfp)
-{
-    __asm __volatile(
+static inline void prefetch_1x(const T *pfp) {
+    __asm __volatile (
         ASM_PREFETCH("[%[pfp]]")
-        :
-        : [pfp] "r"(pfp)
-        : "memory");
+    :
+    : [pfp] "r" (pfp)
+    : "memory"
+    );
 }
+
diff --git a/src/core/NEON/kernels/arm_gemm/buffer_manager.hpp b/src/core/NEON/kernels/arm_gemm/buffer_manager.hpp
index dd74744..03f099d 100644
--- a/src/core/NEON/kernels/arm_gemm/buffer_manager.hpp
+++ b/src/core/NEON/kernels/arm_gemm/buffer_manager.hpp
@@ -38,36 +38,33 @@
 
 #endif
 
-namespace arm_gemm
-{
+namespace arm_gemm {
+
 #ifndef NO_MULTI_THREADING
-enum class BufferStatus
-{
+enum class BufferStatus {
     IDLE,
     POPULATING,
     BUSY
 };
 
-class Buffer
-{
+class Buffer {
 private:
-    const int   _maxusers; // Maximum permissible threads.
-    void *const _storage;  // Storage for buffer content.
+    const int                _maxusers;    // Maximum permissible threads.
+    void * const             _storage;     // Storage for buffer content.
 
-    int _numusers; // Actual number of threads (might be lower).
+    int                      _numusers;    // Actual number of threads (might be lower).
 
-    volatile BufferStatus _status = BufferStatus::IDLE; // Status
-    std::atomic_int       _users  = {};                 // How many users are still using the buffer.
-    volatile int          _index  = 0;                  // Which block of data currently resides in the buffer.
+    volatile BufferStatus    _status = BufferStatus::IDLE; // Status
+    std::atomic_int          _users = { };   // How many users are still using the buffer.
+    volatile int             _index = 0;   // Which block of data currently resides in the buffer.
 
-    std::mutex _lock = {};
+    std::mutex               _lock = { };
 #ifdef USE_SEMAPHORE
-    std::condition_variable _cv = {};
+    std::condition_variable  _cv = { };
 #endif
 
     template <typename T>
-    void populate_buffer(T func)
-    {
+    void populate_buffer(T func) {
         func(_storage);
 
         /* Now mark it as ready. */
@@ -78,17 +75,15 @@
             _cv.notify_all();
         }
 #else
-        _status     = BufferStatus::BUSY;
+        _status = BufferStatus::BUSY;
 #endif
     }
 
 public:
     Buffer(Buffer &) = delete;
-    Buffer &operator=(Buffer &) = delete;
+    Buffer &operator= (Buffer &) = delete;
 
-    Buffer(void *storage, int maxusers)
-        : _maxusers(maxusers), _storage(storage), _numusers(maxusers)
-    {
+    Buffer(void *storage, int maxusers) : _maxusers(maxusers), _storage(storage), _numusers(maxusers) {
         _status = BufferStatus::IDLE;
     }
 
@@ -99,38 +94,32 @@
      * If it's already being populated by another thread or is ready, return.
      */
     template <typename T>
-    void try_populate(const int index, T func)
-    {
-        for(;;)
-        {
+    void try_populate(const int index, T func) {
+        for (;;) {
 #ifdef USE_SEMAPHORE
             /* If it's busy with a previous index, wait on the semaphore. */
-            if((_status == BufferStatus::BUSY) && (_index != index))
-            {
+            if ((_status == BufferStatus::BUSY) && (_index != index)) {
                 std::unique_lock<std::mutex> ul(_lock);
 
-                if((_status == BufferStatus::BUSY) && (_index != index))
-                {
+                if ((_status == BufferStatus::BUSY) && (_index != index)) {
                     _cv.wait(ul);
                 }
             }
 #endif
             /* Return if another thread is populating it already. */
-            if((_index == index) && ((_status == BufferStatus::POPULATING) || (_status == BufferStatus::BUSY)))
-            {
+            if ((_index == index) &&
+                ((_status == BufferStatus::POPULATING) || (_status == BufferStatus::BUSY))) {
                 return;
             }
 
-            if(_status == BufferStatus::IDLE)
-            {
+            if (_status == BufferStatus::IDLE) {
                 std::lock_guard<std::mutex> guard(_lock);
 
                 /* If the buffer is still idle, we can grab it and populate it. */
-                if(_status == BufferStatus::IDLE)
-                {
+                if (_status == BufferStatus::IDLE) {
                     _status = BufferStatus::POPULATING;
-                    _index  = index;
-                    _users  = _numusers;
+                    _index = index;
+                    _users = _numusers;
                     break;
                 }
             }
@@ -141,26 +130,26 @@
     }
 
     template <typename T>
-    void *get(const int index, T func)
-    {
+    void *get(const int index, T func) {
         // Loop until we achieve something.
-        for(;;)
-        {
+        for (;;) {
             // If the index is correct and the buffer status is busy then we can
             // just return the content.  No locking is needed here as the index
             // cannot change (and status cannot change from BUSY) until all
             // users have finished.
-            if((_index == index) && (_status == BufferStatus::BUSY))
-            {
+            if ((_index == index) && (_status == BufferStatus::BUSY)) {
                 return _storage;
             }
+
+            /* If the buffer still has some previous content, or is being
+             * populated, we can wait with the semaphore.  */
 #ifdef USE_SEMAPHORE
-            if(((_status == BufferStatus::BUSY) && (_index != index)) || (_status == BufferStatus::POPULATING))
-            {
+            if (((_status == BufferStatus::BUSY) && (_index != index)) ||
+                 (_status == BufferStatus::POPULATING)) {
                 std::unique_lock<std::mutex> ul(_lock);
 
-                if(((_status == BufferStatus::BUSY) && (_index != index)) || (_status == BufferStatus::POPULATING))
-                {
+                if (((_status == BufferStatus::BUSY) && (_index != index)) ||
+                     (_status == BufferStatus::POPULATING)) {
                     _cv.wait(ul);
                 }
             }
@@ -168,17 +157,15 @@
 
             // If it's idle, we need to populate it.  The IDLE->POPULATING
             // transition requires the lock.
-            if(_status == BufferStatus::IDLE)
-            {
+            if (_status == BufferStatus::IDLE) {
                 std::lock_guard<std::mutex> guard(_lock);
 
                 /* If it's still idle, grab it.  Otherwise drop through and
                  * we'll do something else next time through the loop.  */
-                if(_status == BufferStatus::IDLE)
-                {
+                if (_status == BufferStatus::IDLE) {
                     _status = BufferStatus::POPULATING;
-                    _index  = index;
-                    _users  = _numusers;
+                    _index = index;
+                    _users = _numusers;
                     break;
                 }
             }
@@ -194,10 +181,8 @@
      * simply (atomically) decrement the user count, and if it's hit zero we
      * flag the buffer as idle.
      */
-    void release(void)
-    {
-        if(--_users == 0)
-        {
+    void release(void) {
+        if (--_users == 0) {
 #ifdef USE_SEMAPHORE
             std::unique_lock<std::mutex> ul(_lock);
             _status = BufferStatus::IDLE;
@@ -211,110 +196,91 @@
     }
 
     /* This is called to change the number of users. */
-    void set_numusers(int numusers)
-    {
+    void set_numusers(int numusers) {
         _numusers = std::min(numusers, _maxusers);
     }
 };
 
-class BufferManager
-{
+
+class BufferManager {
 private:
     /* This has to be a vector of Buffer *, because a Buffer cannot be moved
      * or copied due to atomic members. */
-    std::vector<Buffer *> _buffers = {};
-    const int             _maxthreads;
-    void *const           _storage;
+    std::vector<Buffer *> _buffers = { };
+    const int _maxthreads;
+    void * const _storage;
 
 public:
     BufferManager(BufferManager &) = delete;
-    BufferManager &operator=(BufferManager &) = delete;
+    BufferManager & operator=(BufferManager &) = delete;
 
     // Say how much storage is needed.
-    static inline size_t get_storage_requirement(const int maxthreads, const size_t buffersize)
-    {
+    static inline size_t get_storage_requirement(const int maxthreads, const size_t buffersize) {
         return buffersize * ((maxthreads == 1) ? 1 : 3);
     }
 
-    BufferManager(const int maxthreads, const size_t buffersize, void *storage)
-        : _maxthreads(maxthreads), _storage(storage)
-    {
+    BufferManager(const int maxthreads, const size_t buffersize, void *storage) : _maxthreads(maxthreads), _storage(storage) {
         const int numbuffers = (maxthreads == 1) ? 1 : 3;
 
         /* We don't need any Buffer objects in single thread mode. */
-        if(_maxthreads == 1)
-        {
+        if (_maxthreads == 1) {
             return;
         }
 
         /* Use intptr_t to avoid performing arithmetic on a void * */
         intptr_t storage_int = reinterpret_cast<intptr_t>(_storage);
 
-        for(int i = 0; i < numbuffers; i++)
-        {
+        for (int i=0; i<numbuffers; i++) {
             _buffers.push_back(new Buffer(reinterpret_cast<void *>(storage_int), _maxthreads));
             storage_int += buffersize;
         }
     }
 
-    ~BufferManager()
-    {
-        while(_buffers.size())
-        {
+    ~BufferManager() {
+        while (_buffers.size()) {
             delete _buffers.back();
             _buffers.pop_back();
         }
     }
 
     template <typename T>
-    void *get(const int index, T func)
-    {
+    void *get(const int index, T func) {
         /* In single thread mode, we just directly call the populating
          * function on the (single) buffer, otherwise forward to the
          * relevant Buffer.  */
-        if(_maxthreads == 1)
-        {
+        if (_maxthreads==1) {
             func(_storage);
             return _storage;
-        }
-        else
-        {
+        } else {
             return _buffers[index % _buffers.size()]->get(index, func);
         }
     }
 
     template <typename T>
-    void try_populate(const int index, T func)
-    {
+    void try_populate(const int index, T func) {
         /* No need for this in single thread mode. */
-        if(_maxthreads == 1)
-        {
+        if (_maxthreads==1) {
             return;
         }
 
         _buffers[index % _buffers.size()]->try_populate(index, func);
     }
 
-    void release(const int index)
-    {
+    void release(const int index) {
         /* No need for this in single thread mode. */
-        if(_maxthreads == 1)
-        {
+        if (_maxthreads==1) {
             return;
         }
 
         _buffers[index % _buffers.size()]->release();
     }
 
-    void set_nthreads(int threads)
-    {
-        if(_maxthreads == 1)
-        {
+    void set_nthreads(int threads) {
+        if (_maxthreads==1) {
             return;
         }
 
-        for(unsigned int i = 0; i < _buffers.size(); i++)
-        {
+        for(unsigned int i=0; i<_buffers.size(); i++) {
             _buffers[i]->set_numusers(threads);
         }
     }
@@ -329,49 +295,35 @@
  * All the other methods do nothing.
  */
 
-class BufferManager
-{
+class BufferManager {
 private:
-    void *const _storage;
+    void * const _storage;
 
 public:
     BufferManager(BufferManager &) = delete;
-    BufferManager &operator=(BufferManager &) = delete;
+    BufferManager & operator=(BufferManager &) = delete;
 
-    BufferManager(const int maxthreads, const size_t buffersize, void *storage)
-        : _storage(storage)
-    {
-    }
+    BufferManager(const int maxthreads, const size_t buffersize, void *storage) : _storage(storage) { }
 
-    ~BufferManager()
-    {
-    }
+    ~BufferManager() { }
 
     // Say how much storage is needed.
-    static inline size_t get_storage_requirement(const int maxthreads, const size_t buffersize)
-    {
+    static inline size_t get_storage_requirement(const int maxthreads, const size_t buffersize) {
         return buffersize;
     }
 
     template <typename T>
-    void try_populate(const int index, T func)
-    {
-    }
+    void try_populate(const int index, T func) { }
 
-    void release(const int index)
-    {
-    }
+    void release(const int index) { }
 
     template <typename T>
-    void *get(const int index, T func)
-    {
+    void *get(const int index, T func) {
         func(_storage);
         return _storage;
     }
 
-    void set_nthreads(int)
-    {
-    }
+    void set_nthreads(int) { }
 };
 
 #endif
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
index d1180b1..fa12942 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
@@ -30,33 +30,31 @@
 #include "gemm_common.hpp"
 #include "gemm_interleaved.hpp"
 
-#include "kernels/a32_sgemm_8x6.hpp"
 #include "kernels/a64_hgemm_24x8.hpp"
 #include "kernels/a64_sgemm_12x8.hpp"
+#include "kernels/a32_sgemm_8x6.hpp"
 
-namespace arm_gemm
-{
-template <>
+namespace arm_gemm {
+
+template<>
 UniqueGemmCommon<__fp16, __fp16> gemm(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
                                       const unsigned int nbatches, const unsigned int nmulti,
                                       const bool trA, const bool trB, const __fp16 alpha, const __fp16 beta,
-                                      const int maxthreads, const bool pretransposed_hint)
-{
+                                      const int maxthreads, const bool pretransposed_hint) {
 #ifdef __aarch64__
 
-    // Only consider the native FP16 kernel if it will get built.
+// Only consider the native FP16 kernel if it will get built.
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || defined(FP16_KERNELS)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     // If the compiler is configured to enable this feature always, then assume it is available at runtime too.
-    const bool use_fp16 = true;
+    const bool use_fp16=true;
 #else
     // Otherwise, detect at runtime via CPUInfo.
-    const bool use_fp16 = ci.has_fp16();
+    const bool use_fp16=ci.has_fp16();
 #endif
 
     // If FP16 is supported, use it.
-    if(use_fp16)
-    {
+    if (use_fp16) {
         return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved<hgemm_24x8, __fp16, __fp16>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
     }
 #endif
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
index c093761..99f061b 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
@@ -29,15 +29,15 @@
 #include "gemv_native_transposed.hpp"
 #include "gemv_pretransposed.hpp"
 
-#include "kernels/a32_sgemm_8x6.hpp"
 #include "kernels/a64_sgemm_12x8.hpp"
-#include "kernels/a64_sgemm_native_16x4.hpp"
-#include "kernels/a64_sgemv_pretransposed.hpp"
+#include "kernels/a32_sgemm_8x6.hpp"
 #include "kernels/a64_sgemv_trans.hpp"
+#include "kernels/a64_sgemv_pretransposed.hpp"
+#include "kernels/a64_sgemm_native_16x4.hpp"
 
-namespace arm_gemm
-{
-template <>
+namespace arm_gemm {
+
+template<>
 UniqueGemmCommon<float, float> gemm<float, float>(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
                                                   const unsigned int nbatches, const unsigned int nmulti,
                                                   const bool trA, const bool trB, const float alpha, const float beta,
@@ -46,18 +46,17 @@
     if (M==1 && nbatches>1) {
         return UniqueGemmCommon<float, float> (new GemvBatched<float, float>(ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
     }
+
 #ifdef __aarch64__
     /* Cases in priority order */
     /* GemvPretransposed: requires M=1, alpha=1, and transposed hint set.  nbatches must be 1 or we would have returned above so don't test. */
-    if(M == 1 && alpha == 1.0f && pretransposed_hint)
-    {
-        return UniqueGemmCommon<float, float>(new GemvPretransposed<sgemv_pretransposed, float, float>(&ci, N, K, nmulti, trB, beta));
+    if (M==1 && alpha==1.0f && pretransposed_hint) {
+        return UniqueGemmCommon<float, float> (new GemvPretransposed<sgemv_pretransposed, float, float>(&ci, N, K, nmulti, trB, beta));
     }
 
     /* GemvNativeTransposed: requires M=1, no trA or trB, doesn't handle alpha */
-    if(M == 1 && alpha == 1.0f && !trA && !trB)
-    {
-        return UniqueGemmCommon<float, float>(new GemvNativeTransposed<sgemv_trans, float, float>(&ci, N, K, nmulti, beta));
+    if (M==1 && alpha==1.0f && !trA && !trB) {
+        return UniqueGemmCommon<float, float> (new GemvNativeTransposed<sgemv_trans, float, float>(&ci, N, K, nmulti, beta));
     }
 
     /* Native GEMM: requires K at least 4, N a multiple of 16, doesn't
@@ -69,9 +68,9 @@
     }
 
     /* Blocked GEMM, handles all cases. */
-    return UniqueGemmCommon<float, float>(new GemmInterleaved<sgemm_12x8, float, float>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+    return UniqueGemmCommon<float, float> (new GemmInterleaved<sgemm_12x8, float, float>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
 #else
-    return UniqueGemmCommon<float, float>(new GemmInterleaved<sgemm_8x6, float, float>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+    return UniqueGemmCommon<float, float> (new GemmInterleaved<sgemm_8x6, float, float>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
 #endif
 }
 
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
index 7669fe0..3175419 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
@@ -29,14 +29,13 @@
 
 #include "kernels/a64_gemm_s16_12x8.hpp"
 
-namespace arm_gemm
-{
-template <>
+namespace arm_gemm {
+
+template<>
 UniqueGemmCommon<int16_t, int32_t> gemm<int16_t, int32_t>(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
                                                           const unsigned int nbatches, const unsigned int nmulti,
                                                           const bool trA, const bool trB, const int32_t alpha, const int32_t beta,
-                                                          const int maxthreads, const bool pretransposed_hint)
-{
+                                                          const int maxthreads, const bool pretransposed_hint) {
     return UniqueGemmCommon<int16_t, int32_t>(new GemmInterleaved<gemm_s16_12x8, int16_t, int32_t>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
 }
 
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
index f134062..7eff47d 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
@@ -27,20 +27,18 @@
 #include "gemm_common.hpp"
 #include "gemm_interleaved.hpp"
 
+#include "kernels/a64_gemm_s8_4x4.hpp"
 #include "kernels/a64_gemm_s16_12x8.hpp"
 #include "kernels/a64_gemm_s8_12x8.hpp"
-#include "kernels/a64_gemm_s8_4x4.hpp"
 
-namespace arm_gemm
-{
-template <>
+namespace arm_gemm {
+
+template<>
 UniqueGemmCommon<int8_t, int32_t> gemm<int8_t, int32_t>(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
                                                         const unsigned int nbatches, const unsigned int nmulti,
                                                         const bool trA, const bool trB, const int32_t alpha, const int32_t beta,
-                                                        const int maxthreads, const bool pretransposed_hint)
-{
-    if(ci.has_dotprod())
-    {
+                                                        const int maxthreads, const bool pretransposed_hint) {
+    if (ci.has_dotprod()) {
         // Dot product supporting CPUs.  This family has a special version for A55r1.
         return UniqueGemmCommon<int8_t, int32_t>(new GemmInterleaved<gemm_s8_12x8, int8_t, int32_t>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
     }
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index 32c65cd..c304edd 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -23,8 +23,8 @@
  */
 #pragma once
 
-#include <assert.h>
 #include <stdio.h>
+#include <assert.h>
 
 #include <algorithm>
 
@@ -41,23 +41,22 @@
 
 // Some macros used to decide how much working space to allocate.
 // Round allocations up to the next cache line.
-#define ALLOC_ROUND 64
-#define ROUND_UP(x) ((((x) + ALLOC_ROUND - 1) / ALLOC_ROUND) * ALLOC_ROUND)
+#define ALLOC_ROUND	64
+#define ROUND_UP(x)	((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)
 
 // Implementation of the GemmCommon abstract class.
 //
 // This implementation interleaves the source matrices in blocks - good for
 // larger matrices.
-namespace arm_gemm
-{
-template <typename strategy, typename To, typename Tr>
-class GemmInterleaved : public GemmCommon<To, Tr>
-{
+namespace arm_gemm {
+
+template<typename strategy, typename To, typename Tr>
+class GemmInterleaved : public GemmCommon<To, Tr> {
     typedef typename strategy::operand_type Toi;
-    typedef typename strategy::result_type  Tri;
+    typedef typename strategy::result_type Tri;
 
     /* const properties set by constructor */
-    const CPUInfo *const _ci;
+    const CPUInfo * const _ci;
 
     const unsigned int _Msize;
     const unsigned int _Nsize;
@@ -72,173 +71,138 @@
     const Tr _alpha;
     const Tr _beta;
 
-    const unsigned int _maxthreads;
-    const bool         _pretransposed;
+    const int _maxthreads;
+    int _nthreads;
+    const bool _pretransposed;
 
     /* Blocking info */
-    unsigned int _k_block = 0;
-    unsigned int _x_block = 0;
-    unsigned int _Mround  = 0;
+    unsigned int _k_block=0;
+    unsigned int _x_block=0;
+    unsigned int _Mround=0;
 
     /* Working space, pretransposed buffer, buffer manager */
-    const Toi     *_B_transposed  = nullptr;
-    BufferManager *_bm            = nullptr;
-    void          *_working_space = nullptr;
+    const Toi *_B_transposed=nullptr;
+    BufferManager *_bm=nullptr;
+    void *_working_space=nullptr;
 
     /* We will need to walk through the blocks of B in a few contexts, so
      * factor that out.  */
-    class blockwalker
-    {
+    class blockwalker {
     private:
         /* Size loops, etc. based on our parent's configuration */
         const GemmInterleaved<strategy, To, Tr> &_parent;
 
-        /* K and X and multi parameters for current iteration. */
-        unsigned int _k0 = 0, _x0 = 0, _multi = 0;
+        /* K, X and multi parameters for current iteration. */
+        unsigned int _k0=0, _x0=0, _multi=0;
 
-        unsigned int _index     = 0;
-        bool         _done      = false;
-        bool         _newkblock = true;
-        bool         _newmulti  = true;
+        unsigned int _index=0;
+        bool _done=false;
+        bool _newkblock=true;
+        bool _newmulti=true;
 
     public:
-        blockwalker(const GemmInterleaved<strategy, To, Tr> &parent)
-            : _parent(parent)
-        {
-        }
+        blockwalker(const GemmInterleaved<strategy, To, Tr> &parent) : _parent(parent) { }
 
-        unsigned int xmax()
-        {
+        unsigned int xmax() {
             return std::min(_x0 + _parent._x_block, _parent._Nsize);
         }
 
-        unsigned int kmax()
-        {
+        unsigned int kmax() {
             return std::min(_k0 + _parent._k_block, _parent._Ksize);
         }
 
         /* Advance to the next block, return false at the end. */
-        bool advance(void)
-        {
-            if(_done)
-            {
+        bool advance(void) {
+            if (_done) {
                 return false;
             }
 
-            _newkblock = false;
+            _newkblock=false;
             _x0 += _parent._x_block;
-            if(_x0 >= _parent._Nsize)
-            {
-                _x0 = 0;
+            if (_x0 >= _parent._Nsize) {
+                _x0=0;
                 _k0 += _parent._k_block;
-                if(_k0 >= _parent._Ksize)
-                {
-                    _k0 = 0;
+                if (_k0 >= _parent._Ksize) {
+                    _k0=0;
                     _multi++;
-                    if(_multi >= _parent._nmulti)
-                    {
-                        _done = true;
+                    if (_multi >= _parent._nmulti) {
+                        _done=true;
                         return false;
                     }
-                    _newmulti = true;
+                    _newmulti=true;
                 }
-                _newkblock = true;
+                _newkblock=true;
             }
             _index++;
 
             return true;
         }
 
-        unsigned int k0(void)
-        {
-            return _k0;
-        }
-        unsigned int x0(void)
-        {
-            return _x0;
-        }
-        unsigned int multi(void)
-        {
-            return _multi;
-        }
-        unsigned int index(void)
-        {
-            return _index;
-        }
-        bool done(void)
-        {
-            return _done;
-        }
-        bool newkblock(void)
-        {
-            return _newkblock;
-        }
+        unsigned int k0(void) { return _k0; }
+        unsigned int x0(void) { return _x0; }
+        unsigned int multi(void) { return _multi; }
+        unsigned int index(void) { return _index; }
+        bool done(void) { return _done; }
+        bool newkblock(void) { return _newkblock; }
     };
 
     // A working size: One of these needed, regardless of thread count.  Divided according to window.
-    size_t get_a_working_size() const
-    {
+    size_t get_a_working_size() const {
         return ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches);
     }
 
     // B working size: 0, 1 or 3 of these needed depending on pretransposed and threading settings.
-    size_t get_b_working_size() const
-    {
+    size_t get_b_working_size() const {
         return ROUND_UP(sizeof(Toi) * _x_block * _k_block);
     }
 
     // C working size: One needed per thread.
-    size_t get_c_working_size() const
-    {
+    size_t get_c_working_size() const {
         return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height);
     }
 
     // Internal execute function.
     // This supports both the "pretransposed" and "standard" interfaces via the template parameter.
-    template <bool pretransposed>
-    void execute_internal(unsigned int start, unsigned int end, int threadid)
-    {
+    template<bool pretransposed>
+    void execute_internal(unsigned int start, unsigned int end, int threadid) {
 #ifdef CYCLE_PROFILING
         profiler prof;
 #endif
-
         strategy strat(_ci);
 
         blockwalker current(*this);
-        blockwalker next = current;
+        blockwalker next=current;
 
         /* Translate 'start' and 'end' into a position within the batches and rows. */
         const unsigned int window_per_batch = _Mround / strategy::out_height;
-        unsigned int       batch_0          = start / window_per_batch;
-        unsigned int       batch_end        = end / window_per_batch;
+        unsigned int batch_0   = start / window_per_batch;
+        unsigned int batch_end = end   / window_per_batch;
 
         /* Compute the M values to operate on */
         unsigned int m_0   = (start - (batch_0 * window_per_batch)) * strategy::out_height;
         unsigned int m_max = (end - (batch_end * window_per_batch)) * strategy::out_height;
 
         /* Make sure we've been set up correctly. */
-        if(pretransposed)
-        {
+        if (pretransposed) {
             assert(_B_transposed);
-        }
-        else
-        {
+        } else {
             assert(_bm);
         }
 
         assert(_working_space);
         int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);
 
-        // Private buffers.  Treat working_space as an array of C buffers (one per thread) first, followed by the (window-divided) A buffer.
+        // Private buffers.  Treat working_space as an array of C buffers
+        // (one per thread) first, followed by the (window-divided) A
+        // buffer.
         // Set a_panel to the base of the A buffers - compute offsets into it based on M/batches later.
-        Toi *const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()));
-        Tri *const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));
+        Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()));
+        Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));
 
         // Shared buffers - these come either from BufferManager or _B_transposed.
         const Toi *b_panel;
 
-        if(pretransposed)
-        {
+        if (pretransposed) {
             b_panel = _B_transposed;
         }
 
@@ -247,33 +211,28 @@
         // newkblock() is always true on the first iteration, so this will be set properly on the first loop.
         int kern_k = 0;
 
-        for(; !current.done(); current.advance())
-        {
-            if(current.newkblock())
-            {
+        for (;!current.done();current.advance()) {
+            if (current.newkblock()) {
 #ifdef CYCLE_PROFILING
-                auto p = prof.ScopedProfiler(PROFILE_PREPA, (end - start) * strategy::out_height * (current.kmax() - current.k0()) * sizeof(Toi));
+                auto p=prof.ScopedProfiler(PROFILE_PREPA, (end - start) * strategy::out_height * (current.kmax()-current.k0()) * sizeof(Toi));
 #endif
-                for(unsigned int batch = batch_0; batch <= batch_end; batch++)
-                {
-                    unsigned int first_m = (batch == batch_0) ? m_0 : 0;
+                for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
+                    unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
                     unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;
 
-                    if(first_m >= last_m)
+                    if (first_m >= last_m)
                         continue;
-                    if(_trA ^ strategy::A_transpose)
-                    {
+
+                    if (_trA ^ strategy::A_transpose) {
                         Transform<strategy::A_interleave, strategy::A_block, true>(
-                            a_panel + ((batch * _Mround + first_m) * _k_block),
-                            this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
-                            this->_lda, first_m, last_m, current.k0(), current.kmax());
-                    }
-                    else
-                    {
+                                   a_panel + ((batch * _Mround + first_m) * _k_block),
+                                   this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
+                                   this->_lda, first_m, last_m, current.k0(), current.kmax());
+                    } else {
                         Transform<strategy::A_interleave, strategy::A_block, false>(
-                            a_panel + ((batch * _Mround + first_m) * _k_block),
-                            this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
-                            this->_lda, first_m, last_m, current.k0(), current.kmax());
+                                   a_panel + ((batch * _Mround + first_m) * _k_block),
+                                   this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
+                                   this->_lda, first_m, last_m, current.k0(), current.kmax());
                     }
                 }
 
@@ -284,8 +243,7 @@
 
             int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width);
 
-            if(!pretransposed)
-            {
+            if (!pretransposed) {
                 /* Look ahead to the next block and populate it if necessary.
                  * This avoids the populate operation becoming a bottleneck, and
                  * helps keep the threads synchronized (the first thread to get
@@ -294,71 +252,60 @@
                  * If we are running single threaded, bm->try_populate() will do
                  * nothing.
                  */
-                if(next.advance())
-                {
-                    _bm->try_populate(next.index(), [&](void *buffer)
-                    {
+                if (next.advance()) {
+                    _bm->try_populate(next.index(), [&](void *buffer) {
 #ifdef CYCLE_PROFILING
-                        auto p = prof.ScopedProfiler(PROFILE_PREPB, (next.xmax() - next.x0()) * (next.kmax() - next.k0()) * sizeof(Toi));
+                        auto p=prof.ScopedProfiler(PROFILE_PREPB, (next.xmax()-next.x0()) * (next.kmax()-next.k0()) * sizeof(Toi));
 #endif
 
                         Toi *b_panel = reinterpret_cast<Toi *>(buffer);
-                        if(_trB ^ strategy::B_transpose)
-                        {
+                        if (_trB ^ strategy::B_transpose) {
                             Transform<strategy::B_interleave, strategy::B_block, true>(
-                                b_panel, this->_Bptr + (next.multi() * this->_B_multi_stride), this->_ldb,
-                                next.x0(), next.xmax(), next.k0(), next.kmax());
-                        }
-                        else
-                        {
+                                       b_panel, this->_Bptr + (next.multi() * this->_B_multi_stride), this->_ldb,
+                                       next.x0(), next.xmax(), next.k0(), next.kmax());
+                        } else {
                             Transform<strategy::B_interleave, strategy::B_block, false>(
-                                b_panel, this->_Bptr + (next.multi() * this->_B_multi_stride), this->_ldb,
-                                next.x0(), next.xmax(), next.k0(), next.kmax());
+                                       b_panel, this->_Bptr + (next.multi() * this->_B_multi_stride), this->_ldb,
+                                       next.x0(), next.xmax(), next.k0(), next.kmax());
                         }
                     });
                 }
+
                 /* Get the buffer for this iteration from the BufferManager. */
-                b_panel = reinterpret_cast<Toi *>(_bm->get(current.index(), [&](void *bpv)
-                {
+                b_panel = reinterpret_cast<Toi *>(_bm->get(current.index(), [&](void *bpv) {
 #ifdef CYCLE_PROFILING
-                    auto p = prof.ScopedProfiler(PROFILE_PREPB, (current.xmax() - current.x0()) * (current.kmax() - current.k0()) * sizeof(Toi));
+                    auto p=prof.ScopedProfiler(PROFILE_PREPB, (current.xmax()-current.x0()) * (current.kmax()-current.k0()) * sizeof(Toi));
 #endif
 
                     Toi *b_panel = reinterpret_cast<Toi *>(bpv);
-                    if(_trB ^ strategy::B_transpose)
-                    {
+                    if (_trB ^ strategy::B_transpose) {
                         Transform<strategy::B_interleave, strategy::B_block, true>(
-                            b_panel, this->_Bptr + (current.multi() * this->_B_multi_stride), this->_ldb,
-                            current.x0(), current.xmax(), current.k0(), current.kmax());
-                    }
-                    else
-                    {
+                                   b_panel, this->_Bptr + (current.multi() * this->_B_multi_stride), this->_ldb,
+                                   current.x0(), current.xmax(), current.k0(), current.kmax());
+                    } else {
                         Transform<strategy::B_interleave, strategy::B_block, false>(
-                            b_panel, this->_Bptr + (current.multi() * this->_B_multi_stride), this->_ldb,
-                            current.x0(), current.xmax(), current.k0(), current.kmax());
+                                   b_panel, this->_Bptr + (current.multi() * this->_B_multi_stride), this->_ldb,
+                                   current.x0(), current.xmax(), current.k0(), current.kmax());
                     }
-
                 }));
             }
 
             /* Do the actual work. */
-            for(unsigned int batch = batch_0; batch <= batch_end; batch++)
-            {
-                unsigned int first_m = (batch == batch_0) ? m_0 : 0;
+            for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
+                unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
                 unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;
 
                 const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * _k_block;
 
-                if(first_m >= last_m)
+                if (first_m >= last_m)
                     continue;
 
-                for(unsigned int y = first_m; y < last_m; y += strategy::out_height)
-                {
+                for (unsigned int y=first_m; y<last_m; y+=strategy::out_height) {
                     unsigned int ymax = std::min(_Msize, y + strategy::out_height);
 
                     {
 #ifdef CYCLE_PROFILING
-                        auto p = prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height * bblocks * strategy::out_width * kern_k));
+                        auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height * bblocks * strategy::out_width * kern_k));
 #endif
 
                         strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
@@ -368,22 +315,19 @@
 
                     {
 #ifdef CYCLE_PROFILING
-                        auto p = prof.ScopedProfiler(PROFILE_MERGE, (strategy::out_height * bblocks * strategy::out_width * sizeof(Tr)));
+                        auto p=prof.ScopedProfiler(PROFILE_MERGE, (strategy::out_height * bblocks * strategy::out_width * sizeof(Tr)));
 #endif
                         MergeResults<strategy::out_width, strategy::out_height>(
-                            this->_Cptr + (batch * this->_C_batch_stride) + (current.multi() * this->_C_multi_stride),
-                            c_panel, this->_ldc, y, ymax, current.x0(), current.xmax(),
-                            _alpha, (current.k0() == 0 ? _beta : static_cast<Tr>(1)));
+                                      this->_Cptr + (batch * this->_C_batch_stride) + (current.multi() * this->_C_multi_stride),
+                                      c_panel, this->_ldc, y, ymax, current.x0(), current.xmax(),
+                                      _alpha, (current.k0()==0 ? _beta : static_cast<Tr>(1)));
                     }
                 }
             }
 
-            if(pretransposed)
-            {
+            if (pretransposed) {
                 b_panel += (bblocks * strat.out_width * kern_k);
-            }
-            else
-            {
+            } else {
                 _bm->release(current.index());
             }
         }
@@ -391,14 +335,15 @@
 
 public:
     GemmInterleaved(GemmInterleaved &) = delete;
-    GemmInterleaved &operator=(GemmInterleaved &) = delete;
+    GemmInterleaved & operator= (GemmInterleaved &) = delete;
 
     /* Constructor */
     GemmInterleaved(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K,
                     const unsigned int nbatches, const unsigned int nmulti, const bool trA, const bool trB,
-                    const Tr alpha, const Tr beta, const int maxthreads, const bool pretransposed)
-        : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmulti(nmulti), _trA(trA), _trB(trB), _alpha(alpha), _beta(beta), _maxthreads(maxthreads), _pretransposed(pretransposed)
-    {
+                    const Tr alpha, const Tr beta, const int maxthreads, const bool pretransposed) :
+                    _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmulti(nmulti),
+                    _trA(trA), _trB(trB), _alpha(alpha), _beta(beta),
+                    _maxthreads(maxthreads), _nthreads(maxthreads), _pretransposed(pretransposed)  {
         const unsigned int L1_size = ci->get_L1_cache_size();
         const unsigned int L2_size = ci->get_L2_cache_size();
 
@@ -426,7 +371,8 @@
 
         // x_block: Work out how many rows (of length k_block) will fit in the L2
         // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
-        _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width + strategy::out_height))) / (sizeof(Toi) * _k_block);
+        _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width + strategy::out_height))) /
+                  (sizeof(Toi) * _k_block);
 
         // Needs to be (at least a single) multiple of the kernel output width.
         _x_block /= strategy::out_width;
@@ -434,7 +380,7 @@
 
         // And tune to the presented problem size.
         int num_x_blocks = iceildiv(N, _x_block);
-        _x_block         = iceildiv(N, num_x_blocks);
+        _x_block = iceildiv(N, num_x_blocks);
 
         _x_block = iceildiv(_x_block, strategy::out_width);
         _x_block *= strategy::out_width;
@@ -450,45 +396,36 @@
     // out work in units of out_height.  Factor batches into the window, but
     // not multi for now (as this would cause problems with the buffer
     // manager).
-
-    unsigned int get_window_size() const override
-    {
+    unsigned int get_window_size() const override {
         // _Mround is a multiple of out_height by definition.
         return (_Mround / strategy::out_height) * _nbatches;
     }
 
     // set_nthreads: pass on to buffer manager to avoid it waiting for non-existant threads.
-    void set_nthreads(int nthreads) override
-    {
-        if(_bm)
-        {
-            _bm->set_nthreads(nthreads);
+    void set_nthreads(int nthreads) override {
+        _nthreads = std::min(nthreads, _maxthreads);
+        if (_bm) {
+            _bm->set_nthreads(_nthreads);
         }
     }
 
     // Execute
-    void execute(unsigned int start, unsigned int end, int threadid) override
-    {
-        if(_pretransposed)
-        {
+    void execute(unsigned int start, unsigned int end, int threadid) override {
+        if (_pretransposed) {
             execute_internal<true>(start, end, threadid);
-        }
-        else
-        {
+        } else {
             execute_internal<false>(start, end, threadid);
         }
     }
 
     // Interface implementation - working space
-    size_t get_working_size() const override
-    {
+    size_t get_working_size() const override {
         // In all cases, we need one A buffer plus a C buffer per thread.
         size_t size = get_a_working_size() + (get_c_working_size() * _maxthreads);
 
         // For pretransposed case, there is no working space needed for B.
         // Otherwise, we need a BufferManager.
-        if(!_pretransposed)
-        {
+        if (!_pretransposed) {
             size += BufferManager::get_storage_requirement(_maxthreads, get_b_working_size());
         }
 
@@ -497,33 +434,28 @@
         return size;
     }
 
-    void set_working_space(void *working_space) override
-    {
+    void set_working_space(void *working_space) override {
         // Make sure everything ends up cache line aligned
         int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
-        intptr_t working_space_int   = reinterpret_cast<intptr_t>(working_space);
+        intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space);
 
-        size_t diff = 0;
+        size_t diff=0;
 
-        if(working_space_int & 0x3F)
-        {
+        if (working_space_int & 0x3F) {
             diff = 0x40 - (working_space_int & 0x3F);
         }
 
         working_space_bytes += diff;
 
-        if(_pretransposed)
-        {
+        if (_pretransposed) {
             // Pretransposed case: just set internal pointer to parameter value.
             _working_space = reinterpret_cast<void *>(working_space_bytes);
-        }
-        else
-        {
+        } else {
             // Otherwise, use the first part of the working space for the buffer manager.
             // It's legal to call this again so don't leak a buffer manager if it already existed.
             delete _bm;
 
-            _bm = new BufferManager(_maxthreads, get_b_working_size(), reinterpret_cast<void *>(working_space_bytes));
+            _bm = new BufferManager(_nthreads, get_b_working_size(), reinterpret_cast<void *>(working_space_bytes));
 
             working_space_bytes += BufferManager::get_storage_requirement(_maxthreads, get_b_working_size());
 
@@ -532,24 +464,20 @@
     }
 
     // Interface implementation - pretransposed
-    bool B_is_pretransposed() const override
-    {
+    bool B_is_pretransposed() const override {
         return _pretransposed;
     }
 
-    bool B_pretranspose_required() const override
-    {
-        return _pretransposed && (_B_transposed == nullptr);
+    bool B_pretranspose_required() const override {
+        return _pretransposed && (_B_transposed==nullptr);
     }
 
     // TODO: this could almost certainly be considerably simpler.
-    size_t get_B_pretransposed_array_size() const override
-    {
-        size_t      total = 0;
+    size_t get_B_pretransposed_array_size() const override {
+        size_t total=0;
         blockwalker current(*this);
 
-        do
-        {
+        do {
             /* Figure out the size of each block. */
             size_t x_size = (current.xmax() - current.x0());
             size_t k_size = (current.kmax() - current.k0());
@@ -562,20 +490,17 @@
             k_size *= strategy::k_unroll;
 
             total += x_size * k_size * sizeof(Toi);
-        }
-        while(current.advance());
+        } while (current.advance());
 
         return total;
     }
 
-    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override
-    {
+    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
         blockwalker current(*this);
-        Toi        *buffer = reinterpret_cast<Toi *>(in_buffer);
-        _B_transposed      = buffer;
+        Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
+        _B_transposed = buffer;
 
-        do
-        {
+        do {
             /* Figure out the size of each block. */
             size_t x_size = (current.xmax() - current.x0());
             size_t k_size = (current.kmax() - current.k0());
@@ -587,31 +512,25 @@
             k_size = iceildiv(k_size, strategy::k_unroll);
             k_size *= strategy::k_unroll;
 
-            if(_trB ^ strategy::B_transpose)
-            {
+            if (_trB ^ strategy::B_transpose) {
                 Transform<strategy::B_interleave, strategy::B_block, true>(
-                    buffer, B + (current.multi() * B_multi_stride), ldb,
-                    current.x0(), current.xmax(), current.k0(), current.kmax());
-            }
-            else
-            {
+                           buffer, B + (current.multi() * B_multi_stride), ldb,
+                           current.x0(), current.xmax(), current.k0(), current.kmax());
+            } else {
                 Transform<strategy::B_interleave, strategy::B_block, false>(
-                    buffer, B + (current.multi() * B_multi_stride), ldb,
-                    current.x0(), current.xmax(), current.k0(), current.kmax());
+                           buffer, B + (current.multi() * B_multi_stride), ldb,
+                           current.x0(), current.xmax(), current.k0(), current.kmax());
             }
 
             buffer += (x_size * k_size);
-        }
-        while(current.advance());
+        } while (current.advance());
     }
 
-    void set_pretransposed_B_data(void *in_buffer) override
-    {
+    void set_pretransposed_B_data(void *in_buffer) override {
         _B_transposed = reinterpret_cast<Toi *>(in_buffer);
     }
 
-    ~GemmInterleaved() override
-    {
+    ~GemmInterleaved() override {
         delete _bm;
     }
 };
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
index 695236b..6fed645 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
@@ -34,8 +34,8 @@
 #include "profiler.hpp"
 #endif
 
-namespace arm_gemm
-{
+namespace arm_gemm {
+
 // Implementation of the GemmCommon abstract class.
 //
 // This is implementation is for native GEMM with no transposition.
@@ -43,11 +43,10 @@
 // By default the source data is used in-place, but if type conversion is
 // needed we need to allocate working space (CURRENTLY NOT IMPLEMENTED).
 
-template <typename strategy, typename To, typename Tr>
-class GemmNative : public GemmCommon<To, Tr>
-{
+template<typename strategy, typename To, typename Tr>
+class GemmNative : public GemmCommon<To, Tr> {
     typedef typename strategy::operand_type Toi;
-    typedef typename strategy::result_type  Tri;
+    typedef typename strategy::result_type Tri;
 
     const unsigned int _Msize;
     const unsigned int _Nsize;
@@ -58,36 +57,34 @@
 
     Tr _beta;
 
-    const CPUInfo *const _ci;
+    const CPUInfo * const _ci;
 
-    unsigned int k_block = 0;
-    unsigned int n_block = 0;
+    unsigned int k_block=0;
+    unsigned int n_block=0;
 
 public:
     GemmNative(GemmNative &) = delete;
-    GemmNative &operator=(GemmNative &) = delete;
+    GemmNative & operator= (GemmNative &) = delete;
 
-    GemmNative(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K, const unsigned int nbatches, const unsigned int nmultis, const Tr beta)
-        : _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmultis(nmultis), _beta(beta), _ci(ci)
-    {
+    GemmNative(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K, const unsigned int nbatches, const unsigned int nmultis, const Tr beta) :
+        _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmultis(nmultis), _beta(beta), _ci(ci) {
         /* For now don't do any blocking. TODO: figure out if we should. */
         k_block = K;
         n_block = N;
     }
 
     // Window is number of out_height blocks
-    unsigned int get_window_size() const override
-    {
+    unsigned int get_window_size() const override {
         return iceildiv(_Msize, strategy::out_height) * _nbatches * _nmultis;
     }
 
     // Actually execute the GEMM.
-    void execute(unsigned int start, unsigned int end, int) override
-    {
+    void execute(unsigned int start, unsigned int end, int) override {
 #ifdef CYCLE_PROFILING
         profiler prof;
 #endif
-        strategy           strat(_ci);
+        strategy strat(_ci);
+
         const unsigned int window_per_batch = iceildiv(_Msize, strategy::out_height);
         const unsigned int window_per_multi = window_per_batch * _nbatches;
 
@@ -103,27 +100,24 @@
         static_assert(std::is_same<To, Toi>::value, "gemm_native: Operand types must be the same.");
         static_assert(std::is_same<Tr, Tri>::value, "gemm_native: Result types must be the same.");
 
-        for(unsigned int multi = first_multi; multi <= last_multi; multi++)
-        {
+        for (unsigned int multi=first_multi; multi<=last_multi; multi++) {
             const unsigned int batch_0   = (multi == first_multi) ? first_batch : 0;
-            const unsigned int batch_max = (multi == last_multi) ? last_batch : _nbatches - 1;
+            const unsigned int batch_max = (multi == last_multi) ? last_batch : (_nbatches-1);
 
-            for(unsigned int batch = batch_0; batch <= batch_max; batch++)
-            {
-                const unsigned int m_start = ((multi == first_multi) && (batch == first_batch)) ? first_row : 0;
-                const unsigned int m_end   = ((multi == last_multi) && (batch == last_batch)) ? last_row : _Msize;
+            for (unsigned int batch=batch_0; batch <= batch_max; batch++) {
+                const unsigned int m_start = ((multi == first_multi) && (batch==first_batch)) ? first_row : 0;
+                const unsigned int m_end = ((multi == last_multi) && (batch==last_batch)) ? last_row : _Msize;
 
-                for(unsigned int y0 = m_start; y0 < m_end; y0 += strategy::out_height)
-                {
+                for (unsigned int y0=m_start; y0<m_end; y0+=strategy::out_height) {
                     const unsigned int ymax = std::min(y0 + strategy::out_height, m_end);
 #ifdef CYCLE_PROFILING
-                    auto p = prof.ScopedProfiler(PROFILE_KERNEL, (ymax - y0) * _Nsize * _Ksize);
+                    auto p = prof.ScopedProfiler(PROFILE_KERNEL, (ymax-y0) * _Nsize * _Ksize);
 #endif
 
                     strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (y0 * this->_lda), this->_lda,
                                  this->_Bptr + (multi * this->_B_multi_stride), this->_ldb,
                                  this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (y0 * this->_ldc), this->_ldc,
-                                 _beta, (ymax - y0), _Nsize, _Ksize);
+                                 _beta, (ymax-y0), _Nsize, _Ksize);
                 }
             }
         }
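
For reference, the window bookkeeping kept above maps a flat window index (as returned by get_window_size()) onto (multi, batch, row-block) coordinates. A minimal standalone sketch of that decomposition, assuming the usual quotient/remainder scheme; the names iceildiv_demo, decompose_window and WindowPos are illustrative only and are not part of arm_gemm:

    #include <cstdio>

    struct WindowPos { unsigned int multi, batch, row; };

    static unsigned int iceildiv_demo(unsigned int a, unsigned int b) {
        return (a + b - 1) / b;   // ceiling division, as used for window sizing
    }

    static WindowPos decompose_window(unsigned int index, unsigned int M,
                                      unsigned int nbatches, unsigned int out_height) {
        const unsigned int window_per_batch = iceildiv_demo(M, out_height);
        const unsigned int window_per_multi = window_per_batch * nbatches;

        WindowPos p;
        p.multi = index / window_per_multi;
        index  -= p.multi * window_per_multi;
        p.batch = index / window_per_batch;
        index  -= p.batch * window_per_batch;
        p.row   = index * out_height;     // starting output row within that batch
        return p;
    }

    int main() {
        // M=100, out_height=8 -> 13 row blocks per batch; 2 batches -> 26 per multi.
        WindowPos p = decompose_window(30, 100, 2, 8);
        std::printf("multi=%u batch=%u row=%u\n", p.multi, p.batch, p.row);  // multi=1 batch=0 row=32
        return 0;
    }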
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
index 8f1f377..4e8b811 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
@@ -29,14 +29,13 @@
 
 #include "kernels/a64_gemm_u16_12x8.hpp"
 
-namespace arm_gemm
-{
-template <>
+namespace arm_gemm {
+
+template<>
 UniqueGemmCommon<uint16_t, uint32_t> gemm<uint16_t, uint32_t>(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
                                                               const unsigned int nbatches, const unsigned int nmulti,
                                                               const bool trA, const bool trB, uint32_t alpha, uint32_t beta,
-                                                              const int maxthreads, const bool pretransposed_hint)
-{
+                                                              const int maxthreads, const bool pretransposed_hint) {
     return UniqueGemmCommon<uint16_t, uint32_t>(new GemmInterleaved<gemm_u16_12x8, uint16_t, uint32_t>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
 }
 
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
index 0e9f3f2..321aa65 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
@@ -27,19 +27,17 @@
 #include "gemm_common.hpp"
 #include "gemm_interleaved.hpp"
 
-#include "kernels/a64_gemm_u8_12x8.hpp"
 #include "kernels/a64_gemm_u8_4x4.hpp"
+#include "kernels/a64_gemm_u8_12x8.hpp"
 
-namespace arm_gemm
-{
-template <>
+namespace arm_gemm {
+
+template<>
 UniqueGemmCommon<uint8_t, uint32_t> gemm<uint8_t, uint32_t>(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
                                                             const unsigned int nbatches, const unsigned int nmulti,
                                                             const bool trA, const bool trB, const uint32_t alpha, const uint32_t beta,
-                                                            const int maxthreads, const bool pretransposed_hint)
-{
-    if(ci.has_dotprod())
-    {
+                                                            const int maxthreads, const bool pretransposed_hint) {
+    if (ci.has_dotprod()) {
         // Dot product supporting CPUs.  This family has a special version for A55r1.
         return UniqueGemmCommon<uint8_t, uint32_t>(new GemmInterleaved<gemm_u8_12x8, uint8_t, uint32_t>(&ci, M, N, K, nbatches, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
     }
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
index bb09770..d91b44b 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
@@ -25,84 +25,70 @@
 
 #include "arm_gemm.hpp"
 
-namespace arm_gemm
-{
+namespace arm_gemm {
 
 /* "Batched GEMV" (where M=1 and nbatches>1) can be executed much more
  * efficiently as a GEMM (with M'=nbatches and nbatches'=1).  This wrapper
  * implements this.  */
-template <typename To, typename Tr>
-class GemvBatched : public GemmCommon<To, Tr>
-{
+template<typename To, typename Tr>
+class GemvBatched : public GemmCommon<To, Tr> {
 private:
     UniqueGemmCommon<To, Tr> _subgemm = nullptr;
 
 public:
     GemvBatched(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
                 const unsigned int nbatches, const unsigned int nmulti, const bool trA, const bool trB,
-                const To alpha, const To beta, const int maxthreads, const bool pretransposed_hint)
-    {
+                const To alpha, const To beta, const int maxthreads, const bool pretransposed_hint) {
         /* Just create a subgemm with batches->M */
-        _subgemm = gemm<To, Tr>(ci, nbatches, N, K, 1, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint);
+        _subgemm = gemm<To,Tr>(ci, nbatches, N, K, 1, nmulti, trA, trB, alpha, beta, maxthreads, pretransposed_hint);
     }
 
     void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride,
                     const To *B, const int ldb, const int B_multi_stride,
-                    Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride) override
-    {
+                          Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride) override {
         /* A and C's batch stride becomes their new row stride.  New batch stride is 0 as nbatches for subgemm is always 1. */
         _subgemm->set_arrays(A, A_batch_stride, 0, A_multi_stride,
                              B, ldb, B_multi_stride,
                              C, C_batch_stride, 0, C_multi_stride);
     }
 
-    unsigned int get_window_size() const override
-    {
+    unsigned int get_window_size() const override {
         return _subgemm->get_window_size();
     }
 
-    void set_nthreads(int nthreads) override
-    {
+    void set_nthreads(int nthreads) override {
         _subgemm->set_nthreads(nthreads);
     }
 
-    void execute(unsigned int start, unsigned int end, int threadid) override
-    {
+    void execute(unsigned int start, unsigned int end, int threadid) override {
         _subgemm->execute(start, end, threadid);
     }
 
-    size_t get_working_size() const override
-    {
+    size_t get_working_size() const override {
         return _subgemm->get_working_size();
     }
 
-    void set_working_space(void *space) override
-    {
+    void set_working_space(void *space) override {
         _subgemm->set_working_space(space);
     }
 
-    bool B_is_pretransposed() const override
-    {
+    bool B_is_pretransposed() const override {
         return _subgemm->B_is_pretransposed();
     }
 
-    bool B_pretranspose_required() const override
-    {
+    bool B_pretranspose_required() const override {
         return _subgemm->B_pretranspose_required();
     }
 
-    size_t get_B_pretransposed_array_size() const override
-    {
+    size_t get_B_pretransposed_array_size() const override {
         return _subgemm->get_B_pretransposed_array_size();
     }
 
-    void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override
-    {
+    void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override {
         _subgemm->pretranspose_B_array(buffer, B, ldb, B_multi_stride);
     }
 
-    void set_pretransposed_B_data(void *buffer) override
-    {
+    void set_pretransposed_B_data(void *buffer) override {
         _subgemm->set_pretransposed_B_data(buffer);
     }
 };
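
For reference, the remapping GemvBatched performs when it builds its subgemm, written out as plain data so the M/nbatches swap and the stride reshuffle are visible in one place. GemmArgs and remap_batched_gemv are illustrative names only; the real wrapper simply forwards to gemm<To, Tr>() and set_arrays() as shown above.

    #include <cassert>

    struct GemmArgs {
        unsigned int M, N, K, nbatches, nmulti;
        int lda, A_batch_stride, ldc, C_batch_stride;
    };

    // A batched GEMV (M == 1, nbatches > 1) is re-expressed as one GEMM with
    // M' = nbatches and nbatches' = 1.  A's and C's batch strides become their
    // new row strides; the new batch strides are 0 since the subgemm has one batch.
    static GemmArgs remap_batched_gemv(const GemmArgs &in) {
        GemmArgs out = in;
        out.M              = in.nbatches;
        out.nbatches       = 1;
        out.lda            = in.A_batch_stride;
        out.A_batch_stride = 0;
        out.ldc            = in.C_batch_stride;
        out.C_batch_stride = 0;
        return out;
    }

    int main() {
        GemmArgs gemv = { 1, 128, 64, 16, 1, 64, 64, 128, 128 };
        GemmArgs gemm = remap_batched_gemv(gemv);
        assert(gemm.M == 16 && gemm.nbatches == 1);
        assert(gemm.lda == 64 && gemm.A_batch_stride == 0);
        assert(gemm.ldc == 128 && gemm.C_batch_stride == 0);
        return 0;
    }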
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
index e5cc79e..241c5fe 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
@@ -34,8 +34,8 @@
 #include "profiler.hpp"
 #endif
 
-namespace arm_gemm
-{
+namespace arm_gemm {
+
 // Implementation of the GemmCommon abstract class.
 //
 // This implementation is for a "native" (no-transform) GEMV with a
@@ -43,53 +43,48 @@
 //
 // As a native operation the source data is used in-place, so the internal
 // and external operand/result types must match.
-template <typename strategy, typename To, typename Tr>
-class GemvNativeTransposed : public GemmCommon<To, Tr>
-{
+template<typename strategy, typename To, typename Tr>
+class GemvNativeTransposed : public GemmCommon<To, Tr> {
     typedef typename strategy::operand_type Toi;
-    typedef typename strategy::result_type  Tri;
+    typedef typename strategy::result_type Tri;
 
     const unsigned int _Nsize;
     const unsigned int _Ksize;
+
     const unsigned int _nmultis;
 
     const Tr _beta;
 
-    const CPUInfo *const _ci;
+    const CPUInfo * const _ci;
 
-    unsigned int m_block = 0;
-    unsigned int n_block = 0;
+    unsigned int m_block=0;
+    unsigned int n_block=0;
 
 public:
     GemvNativeTransposed(GemvNativeTransposed &) = delete;
-    GemvNativeTransposed &operator=(GemvNativeTransposed &) = delete;
+    GemvNativeTransposed & operator= (GemvNativeTransposed &) = delete;
 
-    GemvNativeTransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K, const unsigned int nmultis, const Tr beta)
-        : _Nsize(N), _Ksize(K), _nmultis(nmultis), _beta(beta), _ci(ci)
-    {
+    GemvNativeTransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K, const unsigned int nmultis, const Tr beta) : _Nsize(N), _Ksize(K), _nmultis(nmultis), _beta(beta), _ci(ci) {
         /* For now don't do any blocking. TODO: figure out if we should. */
         m_block = K;
         n_block = N;
     }
 
     // Window is number of out_width blocks times number of multis.
-    unsigned int get_window_size() const override
-    {
+    unsigned int get_window_size() const override {
         return iceildiv(_Nsize, strategy::out_width) * _nmultis;
     }
 
     // Actually execute the GEMV.
-    void execute(unsigned int start, unsigned int end, int) override
-    {
+    void execute(unsigned int start, unsigned int end, int) override {
 #ifdef CYCLE_PROFILING
         profiler prof;
 #endif
-
         strategy strat(_ci);
 
         const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width);
-        const unsigned int multi_0          = start / window_per_multi;
-        const unsigned int multi_end        = end / window_per_multi;
+        const unsigned int multi_0   = start / window_per_multi;
+        const unsigned int multi_end = end   / window_per_multi;
 
         const unsigned int n_0   = (start - (multi_0 * window_per_multi)) * strategy::out_width;
         const unsigned int n_max = (end - (multi_end * window_per_multi)) * strategy::out_width;
@@ -97,27 +92,25 @@
         static_assert(std::is_same<To, Toi>::value, "gemv_transposed: Operand types must be the same.");
         static_assert(std::is_same<Tr, Tri>::value, "gemv_transposed: Result types must be the same.");
 
-        for(unsigned int multi = multi_0; multi <= multi_end; multi++)
-        {
-            const unsigned int n_start = (multi == multi_0) ? n_0 : 0;
-            const unsigned int n_end   = (multi == multi_end) ? n_max : _Nsize;
+        for (unsigned int multi=multi_0; multi<=multi_end; multi++) {
+            const unsigned int n_start = (multi==multi_0) ? n_0 : 0;
+            const unsigned int n_end = (multi==multi_end) ? n_max : _Nsize;
 
-            if(n_end <= n_start)
+            if (n_end <= n_start)
                 continue;
 
-            for(unsigned int m0 = 0; m0 < _Ksize; m0 += m_block)
-            {
+            for (unsigned int m0=0; m0<_Ksize; m0+=m_block) {
                 unsigned int mmax = std::min(m0 + m_block, _Ksize);
-                for(unsigned int n0 = n_start; n0 < n_end; n0 += n_block)
-                {
+
+                for (unsigned int n0=n_start; n0<n_end; n0+=n_block) {
                     unsigned int nmax = std::min(n0 + n_block, n_end);
 #ifdef CYCLE_PROFILING
-                    auto p = prof.ScopedProfiler(PROFILE_KERNEL, (mmax - m0) * (nmax - n0));
+                    auto p = prof.ScopedProfiler(PROFILE_KERNEL, (mmax-m0) * (nmax-n0));
 #endif
                     strat.kernel(this->_Bptr + (multi * this->_B_multi_stride) + (m0 * this->_ldb) + n0,
                                  this->_Aptr + (multi * this->_A_multi_stride) + m0,
                                  this->_Cptr + (multi * this->_C_multi_stride) + n0,
-                                 _beta, this->_ldb, (mmax - m0), (nmax - n0));
+                                 _beta, this->_ldb, (mmax-m0), (nmax-n0));
                 }
             }
         }
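
For reference, a plain scalar model of the block a single strat.kernel() call in GemvNativeTransposed::execute is handed, assuming the conventional y = beta*y + B^T x accumulation. The real strategy kernel's signature and blocking differ; gemv_transposed_block_ref is an illustrative name only.

    #include <cstdio>

    // For one (m0..mmax) x (n0..nmax) block:
    //   y[n] = beta * y[n] + sum_m x[m] * B[m*ldb + n]
    // where x is the input vector (A above), B is the K x N transposed matrix
    // and y is the output vector (C above).
    static void gemv_transposed_block_ref(const float *B, int ldb,
                                          const float *x, float *y, float beta,
                                          unsigned int m0, unsigned int mmax,
                                          unsigned int n0, unsigned int nmax) {
        for (unsigned int n = n0; n < nmax; n++) {
            float acc = beta * y[n];
            for (unsigned int m = m0; m < mmax; m++) {
                acc += x[m] * B[m * ldb + n];
            }
            y[n] = acc;
        }
    }

    int main() {
        // K=3 rows of B, N=2 columns; x = {1,2,3}, beta = 0.
        const float B[3 * 2] = { 1, 2,
                                 3, 4,
                                 5, 6 };
        const float x[3] = { 1, 2, 3 };
        float y[2] = { 0, 0 };
        gemv_transposed_block_ref(B, 2, x, y, 0.0f, 0, 3, 0, 2);
        std::printf("%g %g\n", y[0], y[1]);   // expected: 22 28
        return 0;
    }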
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
index 770ee03..e53ddb2 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
@@ -34,66 +34,64 @@
 #include "profiler.hpp"
 #endif
 
-namespace arm_gemm
-{
+namespace arm_gemm {
+
 // Implementation of the GemmCommon abstract class.
 //
 // This implementation is for GEMV with pretransposition.
+//
 // batches are not supported as a batched GEMV makes no sense (can be converted to a GEMM).
-
-template <typename strategy, typename To, typename Tr>
-class GemvPretransposed : public GemmCommon<To, Tr>
-{
+template<typename strategy, typename To, typename Tr>
+class GemvPretransposed : public GemmCommon<To, Tr> {
     typedef typename strategy::operand_type Toi;
-    typedef typename strategy::result_type  Tri;
+    typedef typename strategy::result_type Tri;
 
     const unsigned int _Nsize;
     const unsigned int _Ksize;
+
     const unsigned int _nmultis;
 
     const bool _trB;
 
     const Tr _beta;
 
-    const CPUInfo *const _ci;
-    const unsigned int   _buffer_per_multi;
+    const CPUInfo * const _ci;
 
-    unsigned int m_block = 0;
-    unsigned int n_block = 0;
+    const unsigned int _buffer_per_multi;
+
+    unsigned int m_block=0;
+    unsigned int n_block=0;
 
     const Toi *_A_pretransposed = nullptr;
 
 public:
     GemvPretransposed(GemvPretransposed &) = delete;
-    GemvPretransposed &operator=(GemvPretransposed &) = delete;
+    GemvPretransposed & operator= (GemvPretransposed &) = delete;
 
-    GemvPretransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K, const unsigned int nmultis, const bool trB, const Tr beta)
-        : _Nsize(N), _Ksize(K), _nmultis(nmultis), _trB(trB), _beta(beta), _ci(ci), _buffer_per_multi(_Ksize * iceildiv(_Nsize, strategy::A_interleave) * strategy::A_interleave)
-    {
+    GemvPretransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K, const unsigned int nmultis, const bool trB, const Tr beta) :
+        _Nsize(N), _Ksize(K), _nmultis(nmultis), _trB(trB), _beta(beta), _ci(ci),
+        _buffer_per_multi(_Ksize * iceildiv(_Nsize, strategy::A_interleave) * strategy::A_interleave) {
         /* For now don't do any blocking. TODO: figure out if we should. */
         m_block = K;
         n_block = N;
     }
 
     // Window is number of out_width blocks, times number of multis.
-    unsigned int get_window_size() const override
-    {
+    unsigned int get_window_size() const override {
         return iceildiv(_Nsize, strategy::out_width) * _nmultis;
     }
 
     // Actually execute the GEMV.
-    void execute(unsigned int start, unsigned int end, int) override
-    {
+    void execute(unsigned int start, unsigned int end, int) override {
 #ifdef CYCLE_PROFILING
         profiler prof;
 #endif
-
         strategy strat(_ci);
 
         /* Break the window values down into multis of interest... */
         const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width);
-        const unsigned int multi_0          = start / window_per_multi;
-        const unsigned int multi_end        = end / window_per_multi;
+        const unsigned int multi_0    = start / window_per_multi;
+        const unsigned int multi_end  = end   / window_per_multi;
 
         /* ... and figure out where we start and end in the first and last multi. */
         const unsigned int n_0   = (start - (multi_0 * window_per_multi)) * strategy::out_width;
@@ -101,66 +99,56 @@
 
         static_assert(std::is_same<Tr, Tri>::value, "GemvPretransposed: Result types must be the same.");
 
-        for(unsigned int multi = multi_0; multi <= multi_end; multi++)
-        {
-            const unsigned int n_start = (multi == multi_0) ? n_0 : 0;
-            const unsigned int n_end   = (multi == multi_end) ? n_max : _Nsize;
+        for (unsigned int multi=multi_0; multi<=multi_end; multi++) {
+            const unsigned int n_start = (multi==multi_0) ? n_0 : 0;
+            const unsigned int n_end = (multi==multi_end) ? n_max : _Nsize;
 
-            if(n_end <= n_start)
+            if (n_end <= n_start)
                 continue;
 
-            for(unsigned int m0 = 0; m0 < _Ksize; m0 += m_block)
-            {
+            for (unsigned int m0=0; m0<_Ksize; m0+=m_block) {
                 unsigned int mmax = std::min(m0 + m_block, _Ksize);
-                for(unsigned int n = n_start; n < n_end; n += n_block)
-                {
+
+                for (unsigned int n=n_start; n<n_end; n+=n_block) {
                     unsigned int nmax = std::min(n + n_block, n_end);
 #ifdef CYCLE_PROFILING
-                    auto p = prof.ScopedProfiler(PROFILE_KERNEL, (mmax - m0) * (nmax - n));
+                    auto p = prof.ScopedProfiler(PROFILE_KERNEL, (mmax-m0) * (nmax-n));
 #endif
                     /* This assumes that the underlying call was a GEMM with M=1; for the N=1 case we would have to pick up this->_Bptr below instead */
                     strat.kernel(_A_pretransposed + (multi * _buffer_per_multi) + (n * _Ksize) + (m0 * strategy::A_interleave),
                                  (_Ksize * strategy::A_interleave),
                                  this->_Aptr + (multi * this->_A_multi_stride) + m0,
                                  this->_Cptr + (multi * this->_C_multi_stride) + n,
-                                 _beta, (mmax - m0), (nmax - n));
+                                 _beta, (mmax-m0), (nmax-n));
                 }
             }
         }
     }
 
     /* Pretransposed interface implementation */
-    bool B_is_pretransposed() const override
-    {
+    bool B_is_pretransposed() const override {
         return true;
     }
 
-    bool B_pretranspose_required() const override
-    {
+    bool B_pretranspose_required() const override {
         /* Transpose is required if _A_pretransposed is still nullptr */
         return (_A_pretransposed == nullptr);
     }
 
-    size_t get_B_pretransposed_array_size() const override
-    {
+    size_t get_B_pretransposed_array_size() const override {
         return _buffer_per_multi * _nmultis * sizeof(To);
     }
 
-    void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override
-    {
+    void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override {
         Toi *A_buffer = reinterpret_cast<Toi *>(buffer);
 
-        for(unsigned int multi = 0; multi < _nmultis; multi++)
-        {
+        for (unsigned int multi=0; multi<_nmultis; multi++) {
             /* Reverse sense here as we are dealing with B rather than A.  So if
              * strategy::A_transpose is false and _trB is false, we still
              * transpose.  */
-            if(_trB ^ strategy::A_transpose)
-            {
+            if (_trB ^ strategy::A_transpose) {
                 Transform<strategy::A_interleave, strategy::A_block, false>(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize);
-            }
-            else
-            {
+            } else {
                 Transform<strategy::A_interleave, strategy::A_block, true>(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize);
             }
         }
@@ -168,8 +156,7 @@
         _A_pretransposed = A_buffer;
     }
 
-    void set_pretransposed_B_data(void *buffer) override
-    {
+    void set_pretransposed_B_data(void *buffer) override {
         _A_pretransposed = reinterpret_cast<Toi *>(buffer);
     }
 };
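
For reference, the buffer sizing arithmetic behind _buffer_per_multi and get_B_pretransposed_array_size(), reproduced standalone so the rounding of N up to a multiple of strategy::A_interleave is easy to check. pretransposed_buffer_bytes and iceildiv_demo are illustrative names only.

    #include <cstddef>
    #include <cstdio>

    static unsigned int iceildiv_demo(unsigned int a, unsigned int b) {
        return (a + b - 1) / b;
    }

    // Per-multi element count: K rows of N columns, with N rounded up to a
    // multiple of the interleave width so the interleaved blocks are complete.
    // Total bytes = per-multi elements * number of multis * element size.
    static size_t pretransposed_buffer_bytes(unsigned int N, unsigned int K,
                                             unsigned int nmultis,
                                             unsigned int A_interleave,
                                             size_t elem_size) {
        const size_t per_multi = size_t(K) * iceildiv_demo(N, A_interleave) * A_interleave;
        return per_multi * nmultis * elem_size;
    }

    int main() {
        // e.g. N=100, K=64, one multi, interleave of 8, float operands:
        // 100 rounds up to 104 columns -> 64 * 104 * 4 bytes = 26624.
        std::printf("%zu\n", pretransposed_buffer_bytes(100, 64, 1, 8, sizeof(float)));
        return 0;
    }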
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp
index de11dc5..01bf1f9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp
@@ -25,8 +25,8 @@
 
 #ifdef __arm__
 
-namespace arm_gemm
-{
+namespace arm_gemm {
+
 // Actual kernel implementations
 void a32_sgemm_8x6(const float *, const float *, float *, int, int, int);
 void a32_sgemm_8x6_a53(const float *, const float *, float *, int, int, int);
@@ -40,8 +40,7 @@
 // All kernels in the family must share these characteristics.  The actual
 // kernel to be used can be chosen at runtime, based on the CPU_type
 // structure.
-class sgemm_8x6
-{
+class sgemm_8x6 {
 public:
     typedef float operand_type;
     typedef float result_type;
@@ -50,25 +49,23 @@
 
     /* Describes the data layout for A input */
     static const int A_interleave = 6;
-    static const int A_block      = 1;
-    static const int A_transpose  = 0;
+    static const int A_block = 1;
+    static const int A_transpose = 0;
 
     /* Same for B input */
     static const int B_interleave = 8;
-    static const int B_block      = 1;
-    static const int B_transpose  = 1;
+    static const int B_block = 1;
+    static const int B_transpose = 1;
 
     /* Kernel blocking parameters */
-    static const int out_width  = 8;
+    static const int out_width = 8;
     static const int out_height = 6;
-    static const int k_unroll   = 1;
+    static const int k_unroll = 1;
 
     kern_type kernel = a32_sgemm_8x6;
 
-    sgemm_8x6(const CPUInfo *ci)
-    {
-        switch(ci->get_cpu_model())
-        {
+    sgemm_8x6(const CPUInfo *ci) {
+        switch(ci->get_cpu_model()) {
             case CPUModel::A53:
                 kernel = a32_sgemm_8x6_a53;
                 break;
@@ -78,7 +75,7 @@
                 break;
 
             default:
-                kernel = a32_sgemm_8x6;
+                /* Generic kernel is selected by default. */
                 break;
         }
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp
index 428498f..e3844d8 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp
@@ -37,360 +37,370 @@
 // Note that the intent of this is that either ablocks or bblocks will be 1
 // - this construction allows the output loop to proceed in either order.
 
-namespace arm_gemm
-{
-void a32_sgemm_8x6_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K)
-{
+namespace arm_gemm {
+
+void a32_sgemm_8x6_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
     const float *a_ptr = Apanel;
-    float       *c_ptr = Cpanel;
+    float *c_ptr = Cpanel;
 
-    for(int yb = 0; yb < ablocks; yb++)
-    {
+    for (int yb=0; yb<ablocks; yb++) {
         const float *a_ptr0 = a_ptr;
-        const float *b_ptr  = Bpanel;
+        const float *b_ptr = Bpanel;
 
-        for(int xb = 0; xb < bblocks; xb++)
-        {
-            a_ptr     = a_ptr0;
+        for (int xb=0; xb<bblocks; xb++) {
+            a_ptr = a_ptr0;
             int tails = (K & 3);
-            if(tails == 0)
-            {
+            if (tails == 0) {
                 tails = 4;
             }
-            int k = ((K + 3) / 4) - 1;
+            int k = ((K+3)/4) - 1;
 
-            __asm __volatile(
-                "vmov.i32    q4, #0\n"
-                "vld1.32    {d0-d1}, [%[a_ptr] :64]\n"
-                "vmov.i32    q5, #0\n"
-                "vld1.32    {d4-d5}, [%[b_ptr] :128]\n"
-                "vmov.i32    q6, #0\n"
-                "ldr        r0, [%[a_ptr], #0x10]\n"
-                "vmov.i32    q7, #0\n"
-                "ldr        r1, [%[a_ptr], #0x14]\n"
-                "vmov.i32    q8, #0\n" ASM_PREFETCH("[%[a_ptr], #0x40]") "vmov.i32    q9, #0\n" ASM_PREFETCH("[%[b_ptr], #0x40]") "vmov.i32    q10, #0\n" ASM_PREFETCH("[%[a_ptr], #0x80]") "vmov.i32    q11, #0\n"
+            __asm __volatile (
+                "vmov.i32	q4, #0\n"
+                "vld1.32	{d0-d1}, [%[a_ptr] :64]\n"
+                "vmov.i32	q5, #0\n"
+                "vld1.32	{d4-d5}, [%[b_ptr] :128]\n"
+                "vmov.i32	q6, #0\n"
+                "ldr		r0, [%[a_ptr], #0x10]\n"
+                "vmov.i32	q7, #0\n"
+                "ldr		r1, [%[a_ptr], #0x14]\n"
+                "vmov.i32	q8, #0\n"
+                ASM_PREFETCH("[%[a_ptr], #0x40]")
+                "vmov.i32	q9, #0\n"
+                ASM_PREFETCH("[%[b_ptr], #0x40]")
+                "vmov.i32	q10, #0\n"
+                ASM_PREFETCH("[%[a_ptr], #0x80]")
+                "vmov.i32	q11, #0\n"
                 ASM_PREFETCH("[%[b_ptr], #0x80]")
-                "vmov.i32    q12, #0\n"
-                "vmov.i32    q13, #0\n" ASM_PREFETCH("[%[a_ptr], #0xC0]") "vmov.i32    q14, #0\n" ASM_PREFETCH("[%[b_ptr], #0XC0]")
-                "vmov.i32    q15, #0\n"
-                "cmp        %[k], #0\n"
-                "beq        6f\n"
+                "vmov.i32	q12, #0\n"
+                "vmov.i32	q13, #0\n"
+                ASM_PREFETCH("[%[a_ptr], #0xC0]")
+                "vmov.i32	q14, #0\n"
+                ASM_PREFETCH("[%[b_ptr], #0XC0]")
+                "vmov.i32	q15, #0\n"
+                "cmp		%[k], #0\n"
+                "beq		6f\n"
 
                 "1:\n"
                 // Unroll 0
-                "vldr        d6, [%[b_ptr], #0x10]\n"
-                "vmov        d2, r0, r1\n"
-                "vmla.f32    q4, q2, d0[0]\n"
-                "ldr        r0, [%[b_ptr], #0x18]\n"
-                "vmla.f32    q5, q2, d0[1]\n"
-                "ldr        r1, [%[b_ptr], #0x1C]\n"
-                "vmla.f32    q6, q2, d1[0]\n"
+                "vldr		d6, [%[b_ptr], #0x10]\n"
+                "vmov		d2, r0, r1\n"
+                "vmla.f32	q4, q2, d0[0]\n"
+                "ldr		r0, [%[b_ptr], #0x18]\n"
+                "vmla.f32	q5, q2, d0[1]\n"
+                "ldr		r1, [%[b_ptr], #0x1C]\n"
+                "vmla.f32	q6, q2, d1[0]\n"
 
-                "vldr        d3, [%[a_ptr], #0x18]\n"
-                "vmov        d7, r0, r1\n"
-                "vmla.f32    q7, q2, d1[1]\n" ASM_PREFETCH("[%[a_ptr], #0x100]")
-                "vmla.f32    q8, q2, d2[0]\n"
-                "vmla.f32    q9, q2, d2[1]\n"
+                "vldr		d3, [%[a_ptr], #0x18]\n"
+                "vmov		d7, r0, r1\n"
+                "vmla.f32	q7, q2, d1[1]\n"
+                ASM_PREFETCH("[%[a_ptr], #0x100]")
+                "vmla.f32	q8, q2, d2[0]\n"
+                "vmla.f32	q9, q2, d2[1]\n"
 
-                "vldr        d4, [%[b_ptr], #0x20]\n"
-                "vmla.f32    q10, q3, d0[0]\n"
-                "ldr        r0, [%[b_ptr], #0x28]\n"
-                "vmla.f32    q11, q3, d0[1]\n"
-                "ldr        r1, [%[b_ptr], #0x2C]\n"
-                "vmla.f32    q12, q3, d1[0]\n"
+                "vldr		d4, [%[b_ptr], #0x20]\n"
+                "vmla.f32	q10, q3, d0[0]\n"
+                "ldr		r0, [%[b_ptr], #0x28]\n"
+                "vmla.f32	q11, q3, d0[1]\n"
+                "ldr		r1, [%[b_ptr], #0x2C]\n"
+                "vmla.f32	q12, q3, d1[0]\n"
 
-                "vldr        d0, [%[a_ptr], #0x20]\n"
-                "vmov        d5, r0, r1\n"
-                "vmla.f32    q13, q3, d1[1]\n"
-                "ldr        r0, [%[a_ptr], #0x28]\n"
-                "vmla.f32    q14, q3, d2[0]\n"
-                "ldr        r1, [%[a_ptr], #0x2C]\n"
-                "vmla.f32    q15, q3, d2[1]\n"
+                "vldr		d0, [%[a_ptr], #0x20]\n"
+                "vmov		d5, r0, r1\n"
+                "vmla.f32	q13, q3, d1[1]\n"
+                "ldr		r0, [%[a_ptr], #0x28]\n"
+                "vmla.f32	q14, q3, d2[0]\n"
+                "ldr		r1, [%[a_ptr], #0x2C]\n"
+                "vmla.f32	q15, q3, d2[1]\n"
 
                 // Unroll 1
-                "vldr        d6, [%[b_ptr], #0x30]\n"
-                "vmov        d1, r0, r1\n"
-                "vmla.f32    q4, q2, d3[0]\n"
-                "ldr        r0, [%[b_ptr], #0x38]\n"
-                "vmla.f32    q5, q2, d3[1]\n"
-                "ldr        r1, [%[b_ptr], #0x3C]\n"
-                "vmla.f32    q6, q2, d0[0]\n"
+                "vldr		d6, [%[b_ptr], #0x30]\n"
+                "vmov		d1, r0, r1\n"
+                "vmla.f32	q4, q2, d3[0]\n"
+                "ldr		r0, [%[b_ptr], #0x38]\n"
+                "vmla.f32	q5, q2, d3[1]\n"
+                "ldr		r1, [%[b_ptr], #0x3C]\n"
+                "vmla.f32	q6, q2, d0[0]\n"
 
-                "vldr        d2, [%[a_ptr], #0x30]\n"
-                "vmov        d7, r0, r1\n"
-                "vmla.f32    q7, q2, d0[1]\n" ASM_PREFETCH("[%[b_ptr], #0x100]")
-                "vmla.f32    q8, q2, d1[0]\n"
-                "vmla.f32    q9, q2, d1[1]\n"
+                "vldr		d2, [%[a_ptr], #0x30]\n"
+                "vmov		d7, r0, r1\n"
+                "vmla.f32	q7, q2, d0[1]\n"
+                ASM_PREFETCH("[%[b_ptr], #0x100]")
+                "vmla.f32	q8, q2, d1[0]\n"
+                "vmla.f32	q9, q2, d1[1]\n"
 
-                "vldr        d4, [%[b_ptr], #0x40]\n"
-                "vmla.f32    q10, q3, d3[0]\n"
-                "ldr        r0, [%[b_ptr], #0x48]\n"
-                "vmla.f32    q11, q3, d3[1]\n"
-                "ldr        r1, [%[b_ptr], #0x4C]\n"
-                "vmla.f32    q12, q3, d0[0]\n"
+                "vldr		d4, [%[b_ptr], #0x40]\n"
+                "vmla.f32	q10, q3, d3[0]\n"
+                "ldr		r0, [%[b_ptr], #0x48]\n"
+                "vmla.f32	q11, q3, d3[1]\n"
+                "ldr		r1, [%[b_ptr], #0x4C]\n"
+                "vmla.f32	q12, q3, d0[0]\n"
 
-                "vldr        d3, [%[a_ptr], #0x38]\n"
-                "vmov        d5, r0, r1\n"
-                "vmla.f32    q13, q3, d0[1]\n"
-                "ldr        r0, [%[a_ptr], #0x40]\n"
-                "vmla.f32    q14, q3, d1[0]\n"
-                "ldr        r1, [%[a_ptr], #0x44]\n"
-                "vmla.f32    q15, q3, d1[1]\n"
+                "vldr		d3, [%[a_ptr], #0x38]\n"
+                "vmov		d5, r0, r1\n"
+                "vmla.f32	q13, q3, d0[1]\n"
+                "ldr		r0, [%[a_ptr], #0x40]\n"
+                "vmla.f32	q14, q3, d1[0]\n"
+                "ldr		r1, [%[a_ptr], #0x44]\n"
+                "vmla.f32	q15, q3, d1[1]\n"
 
                 // Unroll 2
-                "vldr        d6, [%[b_ptr], #0x50]\n"
-                "vmov        d0, r0, r1\n"
-                "vmla.f32    q4, q2, d2[0]\n"
-                "ldr        r0, [%[b_ptr], #0x58]\n"
-                "vmla.f32    q5, q2, d2[1]\n"
-                "ldr        r1, [%[b_ptr], #0x5C]\n"
-                "vmla.f32    q6, q2, d3[0]\n"
+                "vldr		d6, [%[b_ptr], #0x50]\n"
+                "vmov		d0, r0, r1\n"
+                "vmla.f32	q4, q2, d2[0]\n"
+                "ldr		r0, [%[b_ptr], #0x58]\n"
+                "vmla.f32	q5, q2, d2[1]\n"
+                "ldr		r1, [%[b_ptr], #0x5C]\n"
+                "vmla.f32	q6, q2, d3[0]\n"
 
-                "vldr        d1, [%[a_ptr], #0x48]\n"
-                "vmov        d7, r0, r1\n"
-                "vmla.f32    q7, q2, d3[1]\n" ASM_PREFETCH("[%[a_ptr], #0x140]")
-                "vmla.f32    q8, q2, d0[0]\n"
-                "vmla.f32    q9, q2, d0[1]\n"
+                "vldr		d1, [%[a_ptr], #0x48]\n"
+                "vmov		d7, r0, r1\n"
+                "vmla.f32	q7, q2, d3[1]\n"
+                ASM_PREFETCH("[%[a_ptr], #0x140]")
+                "vmla.f32	q8, q2, d0[0]\n"
+                "vmla.f32	q9, q2, d0[1]\n"
 
-                "vldr        d4, [%[b_ptr], #0x60]\n"
-                "vmla.f32    q10, q3, d2[0]\n"
-                "ldr        r0, [%[b_ptr], #0x68]\n"
-                "vmla.f32    q11, q3, d2[1]\n"
-                "ldr        r1, [%[b_ptr], #0x6C]\n"
-                "vmla.f32    q12, q3, d3[0]\n"
+                "vldr		d4, [%[b_ptr], #0x60]\n"
+                "vmla.f32	q10, q3, d2[0]\n"
+                "ldr		r0, [%[b_ptr], #0x68]\n"
+                "vmla.f32	q11, q3, d2[1]\n"
+                "ldr		r1, [%[b_ptr], #0x6C]\n"
+                "vmla.f32	q12, q3, d3[0]\n"
 
-                "vldr        d2, [%[a_ptr], #0x50]\n"
-                "vmov        d5, r0, r1\n"
-                "vmla.f32    q13, q3, d3[1]\n"
-                "ldr        r0, [%[a_ptr], #0x58]\n"
-                "vmla.f32    q14, q3, d0[0]\n"
-                "ldr        r1, [%[a_ptr], #0x5C]\n"
-                "vmla.f32    q15, q3, d0[1]\n"
-                "add        %[a_ptr], %[a_ptr], #0x60\n"
+                "vldr		d2, [%[a_ptr], #0x50]\n"
+                "vmov		d5, r0, r1\n"
+                "vmla.f32	q13, q3, d3[1]\n"
+                "ldr		r0, [%[a_ptr], #0x58]\n"
+                "vmla.f32	q14, q3, d0[0]\n"
+                "ldr		r1, [%[a_ptr], #0x5C]\n"
+                "vmla.f32	q15, q3, d0[1]\n"
+                "add		%[a_ptr], %[a_ptr], #0x60\n"
 
                 // Unroll 3
-                "vldr        d6, [%[b_ptr], #0x70]\n"
-                "vmov        d3, r0, r1\n"
-                "vmla.f32    q4, q2, d1[0]\n"
-                "ldr        r0, [%[b_ptr], #0x78]\n"
-                "vmla.f32    q5, q2, d1[1]\n"
-                "ldr        r1, [%[b_ptr], #0x7C]\n"
-                "vmla.f32    q6, q2, d2[0]\n"
-                "add        %[b_ptr], %[b_ptr], #0x80\n"
+                "vldr		d6, [%[b_ptr], #0x70]\n"
+                "vmov		d3, r0, r1\n"
+                "vmla.f32	q4, q2, d1[0]\n"
+                "ldr		r0, [%[b_ptr], #0x78]\n"
+                "vmla.f32	q5, q2, d1[1]\n"
+                "ldr		r1, [%[b_ptr], #0x7C]\n"
+                "vmla.f32	q6, q2, d2[0]\n"
+                "add		%[b_ptr], %[b_ptr], #0x80\n"
 
-                "vldr        d0, [%[a_ptr], #0x00]\n"
-                "vmov        d7, r0, r1\n"
-                "vmla.f32    q7, q2, d2[1]\n" ASM_PREFETCH("[%[b_ptr], #0xC0]")
-                "vmla.f32    q8, q2, d3[0]\n"
-                "vmla.f32    q9, q2, d3[1]\n"
+                "vldr		d0, [%[a_ptr], #0x00]\n"
+                "vmov		d7, r0, r1\n"
+                "vmla.f32	q7, q2, d2[1]\n"
+                ASM_PREFETCH("[%[b_ptr], #0xC0]")
+                "vmla.f32	q8, q2, d3[0]\n"
+                "vmla.f32	q9, q2, d3[1]\n"
 
-                "vldr        d4, [%[b_ptr], #0x00]\n"
-                "vmla.f32    q10, q3, d1[0]\n"
-                "ldr        r0, [%[b_ptr], #0x08]\n"
-                "vmla.f32    q11, q3, d1[1]\n"
-                "ldr        r1, [%[b_ptr], #0x0C]\n"
-                "vmla.f32    q12, q3, d2[0]\n"
-                "subs        %[k], %[k], #1\n"
+                "vldr		d4, [%[b_ptr], #0x00]\n"
+                "vmla.f32	q10, q3, d1[0]\n"
+                "ldr		r0, [%[b_ptr], #0x08]\n"
+                "vmla.f32	q11, q3, d1[1]\n"
+                "ldr		r1, [%[b_ptr], #0x0C]\n"
+                "vmla.f32	q12, q3, d2[0]\n"
+                "subs		%[k], %[k], #1\n"
 
-                "vldr        d1, [%[a_ptr], #0x08]\n"
-                "vmov        d5, r0, r1\n"
-                "vmla.f32    q13, q3, d2[1]\n"
-                "ldr        r0, [%[a_ptr], #0x10]\n"
-                "vmla.f32    q14, q3, d3[0]\n"
-                "ldr        r1, [%[a_ptr], #0x14]\n"
-                "vmla.f32    q15, q3, d3[1]\n"
-                "bne        1b\n"
+                "vldr		d1, [%[a_ptr], #0x08]\n"
+                "vmov		d5, r0, r1\n"
+                "vmla.f32	q13, q3, d2[1]\n"
+                "ldr		r0, [%[a_ptr], #0x10]\n"
+                "vmla.f32	q14, q3, d3[0]\n"
+                "ldr		r1, [%[a_ptr], #0x14]\n"
+                "vmla.f32	q15, q3, d3[1]\n"
+                "bne		1b\n"
 
                 // "Tails" shows how many multiply blocks are needed at the
                 // end, must be 1-4 inclusive.  Bail out to alternative tail
                 // immediately if it's 1.
                 "6:\n"
-                "subs        %[tails], %[tails], #1\n"
-                "beq        3f\n"
+                "subs		%[tails], %[tails], #1\n"
+                "beq		3f\n"
 
                 // Detached final iteration - for now adapt the generic
                 // tails rather than reimplementing for A53.
 
                 // Unroll 0
-                "vmov        d2, r0, r1\n"
-                "add        %[a_ptr], %[a_ptr], #0x18\n"
-                "vmla.f32    q4, q2, d0[0]\n"
-                "vld1.32    {d3}, [%[a_ptr] :64]!\n"
-                "vmla.f32    q5, q2, d0[1]\n"
-                "add        %[b_ptr], %[b_ptr], #0x10\n"
-                "vmla.f32    q6, q2, d1[0]\n"
-                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
-                "vmla.f32    q7, q2, d1[1]\n"
-                "vmla.f32    q8, q2, d2[0]\n"
-                "subs        %[tails], %[tails], #1\n"
-                "vmla.f32    q9, q2, d2[1]\n"
-                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+                "vmov		d2, r0, r1\n"
+                "add		%[a_ptr], %[a_ptr], #0x18\n"
+                "vmla.f32	q4, q2, d0[0]\n"
+                "vld1.32	{d3}, [%[a_ptr] :64]!\n"
+                "vmla.f32	q5, q2, d0[1]\n"
+                "add		%[b_ptr], %[b_ptr], #0x10\n"
+                "vmla.f32	q6, q2, d1[0]\n"
+                "vld1.32	{d6-d7}, [%[b_ptr] :128]!\n"
+                "vmla.f32	q7, q2, d1[1]\n"
+                "vmla.f32	q8, q2, d2[0]\n"
+                "subs		%[tails], %[tails], #1\n"
+                "vmla.f32	q9, q2, d2[1]\n"
+                "vld1.32	{d4-d5}, [%[b_ptr] :128]!\n"
 
-                "vmla.f32    q10, q3, d0[0]\n"
-                "vmla.f32    q11, q3, d0[1]\n"
-                "vmla.f32    q12, q3, d1[0]\n"
-                "vmla.f32    q13, q3, d1[1]\n"
-                "vld1.32    {d0-d1}, [%[a_ptr] :64]!\n"
-                "vmla.f32    q14, q3, d2[0]\n"
-                "vmla.f32    q15, q3, d2[1]\n"
-                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
-                "beq        4f\n"
+                "vmla.f32	q10, q3, d0[0]\n"
+                "vmla.f32	q11, q3, d0[1]\n"
+                "vmla.f32	q12, q3, d1[0]\n"
+                "vmla.f32	q13, q3, d1[1]\n"
+                "vld1.32	{d0-d1}, [%[a_ptr] :64]!\n"
+                "vmla.f32	q14, q3, d2[0]\n"
+                "vmla.f32	q15, q3, d2[1]\n"
+                "vld1.32	{d6-d7}, [%[b_ptr] :128]!\n"
+                "beq		4f\n"
 
                 // Unroll 1
-                "vmla.f32    q4, q2, d3[0]\n"
-                "vmla.f32    q5, q2, d3[1]\n"
-                "subs        %[tails], %[tails], #1\n"
-                "vmla.f32    q6, q2, d0[0]\n"
-                "vmla.f32    q7, q2, d0[1]\n"
-                "vmla.f32    q8, q2, d1[0]\n"
-                "vmla.f32    q9, q2, d1[1]\n"
-                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+                "vmla.f32	q4, q2, d3[0]\n"
+                "vmla.f32	q5, q2, d3[1]\n"
+                "subs		%[tails], %[tails], #1\n"
+                "vmla.f32	q6, q2, d0[0]\n"
+                "vmla.f32	q7, q2, d0[1]\n"
+                "vmla.f32	q8, q2, d1[0]\n"
+                "vmla.f32	q9, q2, d1[1]\n"
+                "vld1.32	{d4-d5}, [%[b_ptr] :128]!\n"
 
-                "vmla.f32    q10, q3, d3[0]\n"
-                "vmla.f32    q11, q3, d3[1]\n"
-                "vld1.32    {d2-d3}, [%[a_ptr] :64]!\n"
-                "vmla.f32    q12, q3, d0[0]\n"
-                "vmla.f32    q13, q3, d0[1]\n"
-                "vmla.f32    q14, q3, d1[0]\n"
-                "vmla.f32    q15, q3, d1[1]\n"
-                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
-                "beq        5f\n"
+                "vmla.f32	q10, q3, d3[0]\n"
+                "vmla.f32	q11, q3, d3[1]\n"
+                "vld1.32	{d2-d3}, [%[a_ptr] :64]!\n"
+                "vmla.f32	q12, q3, d0[0]\n"
+                "vmla.f32	q13, q3, d0[1]\n"
+                "vmla.f32	q14, q3, d1[0]\n"
+                "vmla.f32	q15, q3, d1[1]\n"
+                "vld1.32	{d6-d7}, [%[b_ptr] :128]!\n"
+                "beq		5f\n"
 
                 // Unroll 2
-                "vld1.32    {d0-d1}, [%[a_ptr] :64]!\n"
-                "vmla.f32    q4, q2, d2[0]\n"
-                "vmla.f32    q5, q2, d2[1]\n"
-                "vmla.f32    q6, q2, d3[0]\n"
-                "vmla.f32    q7, q2, d3[1]\n"
-                "vmla.f32    q8, q2, d0[0]\n"
-                "vmla.f32    q9, q2, d0[1]\n"
-                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+                "vld1.32	{d0-d1}, [%[a_ptr] :64]!\n"
+                "vmla.f32	q4, q2, d2[0]\n"
+                "vmla.f32	q5, q2, d2[1]\n"
+                "vmla.f32	q6, q2, d3[0]\n"
+                "vmla.f32	q7, q2, d3[1]\n"
+                "vmla.f32	q8, q2, d0[0]\n"
+                "vmla.f32	q9, q2, d0[1]\n"
+                "vld1.32	{d4-d5}, [%[b_ptr] :128]!\n"
 
-                "vmla.f32    q10, q3, d2[0]\n"
-                "vmla.f32    q11, q3, d2[1]\n"
-                "vmla.f32    q12, q3, d3[0]\n"
-                "vmla.f32    q13, q3, d3[1]\n"
-                "vld1.32    {d2-d3}, [%[a_ptr] :64]!\n"
-                "vmla.f32    q14, q3, d0[0]\n"
-                "vmla.f32    q15, q3, d0[1]\n"
-                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+                "vmla.f32	q10, q3, d2[0]\n"
+                "vmla.f32	q11, q3, d2[1]\n"
+                "vmla.f32	q12, q3, d3[0]\n"
+                "vmla.f32	q13, q3, d3[1]\n"
+                "vld1.32	{d2-d3}, [%[a_ptr] :64]!\n"
+                "vmla.f32	q14, q3, d0[0]\n"
+                "vmla.f32	q15, q3, d0[1]\n"
+                "vld1.32	{d6-d7}, [%[b_ptr] :128]!\n"
 
                 // Unroll 3
-                "vmla.f32    q4, q2, d1[0]\n"
-                "vmla.f32    q10, q3, d1[0]\n"
-                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q5, q2, d1[1]\n"
-                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q11, q3, d1[1]\n"
-                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q6, q2, d2[0]\n"
-                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q12, q3, d2[0]\n"
-                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q7, q2, d2[1]\n"
-                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q13, q3, d2[1]\n"
-                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q8, q2, d3[0]\n"
-                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q14, q3, d3[0]\n"
-                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q9, q2, d3[1]\n"
-                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q15, q3, d3[1]\n"
-                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
-                "b        2f\n"
+                "vmla.f32	q4, q2, d1[0]\n"
+                "vmla.f32	q10, q3, d1[0]\n"
+                "vst1.32	{d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q5, q2, d1[1]\n"
+                "vst1.32	{d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q11, q3, d1[1]\n"
+                "vst1.32	{d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q6, q2, d2[0]\n"
+                "vst1.32	{d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q12, q3, d2[0]\n"
+                "vst1.32	{d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q7, q2, d2[1]\n"
+                "vst1.32	{d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q13, q3, d2[1]\n"
+                "vst1.32	{d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q8, q2, d3[0]\n"
+                "vst1.32	{d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q14, q3, d3[0]\n"
+                "vst1.32	{d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q9, q2, d3[1]\n"
+                "vst1.32	{d28-d29}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q15, q3, d3[1]\n"
+                "vst1.32	{d18-d19}, [%[c_ptr] :128]!\n"
+                "b		2f\n"
 
                 // tails==1 final tail
                 "3:\n"
-                "vmov        d2, r0, r1\n"
-                "add        %[b_ptr], %[b_ptr], #0x10\n"
-                "vmla.f32    q4, q2, d0[0]\n"
-                "add        %[a_ptr], %[a_ptr], #0x18\n"
-                "vmla.f32    q5, q2, d0[1]\n"
-                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
-                "vmla.f32    q6, q2, d1[0]\n"
-                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q10, q3, d0[0]\n"
-                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q11, q3, d0[1]\n"
-                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q12, q3, d1[0]\n"
-                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q7, q2, d1[1]\n"
-                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q13, q3, d1[1]\n"
-                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q8, q2, d2[0]\n"
-                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q14, q3, d2[0]\n"
-                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q9, q2, d2[1]\n"
-                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q15, q3, d2[1]\n"
-                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
-                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
-                "b        2f\n"
+                "vmov		d2, r0, r1\n"
+                "add		%[b_ptr], %[b_ptr], #0x10\n"
+                "vmla.f32	q4, q2, d0[0]\n"
+                "add		%[a_ptr], %[a_ptr], #0x18\n"
+                "vmla.f32	q5, q2, d0[1]\n"
+                "vld1.32	{d6-d7}, [%[b_ptr] :128]!\n"
+                "vmla.f32	q6, q2, d1[0]\n"
+                "vst1.32	{d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q10, q3, d0[0]\n"
+                "vst1.32	{d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q11, q3, d0[1]\n"
+                "vst1.32	{d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q12, q3, d1[0]\n"
+                "vst1.32	{d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q7, q2, d1[1]\n"
+                "vst1.32	{d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q13, q3, d1[1]\n"
+                "vst1.32	{d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q8, q2, d2[0]\n"
+                "vst1.32	{d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q14, q3, d2[0]\n"
+                "vst1.32	{d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q9, q2, d2[1]\n"
+                "vst1.32	{d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q15, q3, d2[1]\n"
+                "vst1.32	{d28-d29}, [%[c_ptr] :128]!\n"
+                "vst1.32	{d18-d19}, [%[c_ptr] :128]!\n"
+                "b		2f\n"
 
                 // tails==2 final tail
                 "4:\n"
-                "vmla.f32    q4, q2, d3[0]\n"
-                "vmla.f32    q10, q3, d3[0]\n"
-                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q5, q2, d3[1]\n"
-                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q11, q3, d3[1]\n"
-                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q6, q2, d0[0]\n"
-                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q12, q3, d0[0]\n"
-                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q7, q2, d0[1]\n"
-                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q13, q3, d0[1]\n"
-                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q8, q2, d1[0]\n"
-                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q14, q3, d1[0]\n"
-                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q9, q2, d1[1]\n"
-                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q15, q3, d1[1]\n"
-                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
-                "b        2f\n"
+                "vmla.f32	q4, q2, d3[0]\n"
+                "vmla.f32	q10, q3, d3[0]\n"
+                "vst1.32	{d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q5, q2, d3[1]\n"
+                "vst1.32	{d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q11, q3, d3[1]\n"
+                "vst1.32	{d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q6, q2, d0[0]\n"
+                "vst1.32	{d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q12, q3, d0[0]\n"
+                "vst1.32	{d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q7, q2, d0[1]\n"
+                "vst1.32	{d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q13, q3, d0[1]\n"
+                "vst1.32	{d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q8, q2, d1[0]\n"
+                "vst1.32	{d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q14, q3, d1[0]\n"
+                "vst1.32	{d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q9, q2, d1[1]\n"
+                "vst1.32	{d28-d29}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q15, q3, d1[1]\n"
+                "vst1.32	{d18-d19}, [%[c_ptr] :128]!\n"
+                "b		2f\n"
 
                 // tails==3 final tail
                 "5:\n"
-                "vmla.f32    q4, q2, d2[0]\n"
-                "vld1.32    {d0}, [%[a_ptr] :64]!\n"
-                "vmla.f32    q5, q2, d2[1]\n"
-                "vmla.f32    q6, q2, d3[0]\n"
-                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q10, q3, d2[0]\n"
-                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q11, q3, d2[1]\n"
-                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q12, q3, d3[0]\n"
-                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q7, q2, d3[1]\n"
-                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q13, q3, d3[1]\n"
-                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q8, q2, d0[0]\n"
-                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q14, q3, d0[0]\n"
-                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q9, q2, d0[1]\n"
-                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q15, q3, d0[1]\n"
-                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
-                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q4, q2, d2[0]\n"
+                "vld1.32	{d0}, [%[a_ptr] :64]!\n"
+                "vmla.f32	q5, q2, d2[1]\n"
+                "vmla.f32	q6, q2, d3[0]\n"
+                "vst1.32	{d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q10, q3, d2[0]\n"
+                "vst1.32	{d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q11, q3, d2[1]\n"
+                "vst1.32	{d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q12, q3, d3[0]\n"
+                "vst1.32	{d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q7, q2, d3[1]\n"
+                "vst1.32	{d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q13, q3, d3[1]\n"
+                "vst1.32	{d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q8, q2, d0[0]\n"
+                "vst1.32	{d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q14, q3, d0[0]\n"
+                "vst1.32	{d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q9, q2, d0[1]\n"
+                "vst1.32	{d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q15, q3, d0[1]\n"
+                "vst1.32	{d28-d29}, [%[c_ptr] :128]!\n"
+                "vst1.32	{d18-d19}, [%[c_ptr] :128]!\n"
 
                 "2:\n"
-                "vst1.32    {d30-d31}, [%[c_ptr] :128]!\n"
-                : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), [k] "+r"(k), [tails] "+r"(tails)
-                :
-                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0", "r1");
+                "vst1.32	{d30-d31}, [%[c_ptr] :128]!\n"
+            : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), [tails] "+r" (tails)
+            :
+            : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0", "r1"
+            );
         }
     }
 }
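
For reference, the K-loop bookkeeping used by this kernel (tails = K & 3, promoted to 4 when zero, and k = ((K+3)/4) - 1), written as plain C++ so the split into 4-deep main-loop iterations plus a 1-4 block tail can be checked. split_k and KSplit are illustrative names only.

    #include <cassert>

    struct KSplit {
        int main_iters;  // iterations of the 4x-unrolled main loop ("k" in the asm)
        int tails;       // multiply blocks handled by the detached tail, 1-4
    };

    static KSplit split_k(int K) {
        KSplit s;
        s.tails = K & 3;
        if (s.tails == 0) {
            s.tails = 4;                    // take a full tail instead of an empty one
        }
        s.main_iters = ((K + 3) / 4) - 1;
        return s;
    }

    int main() {
        // Every K >= 1 is covered exactly once: 4 blocks per main iteration plus the tail.
        for (int K = 1; K <= 64; K++) {
            KSplit s = split_k(K);
            assert(4 * s.main_iters + s.tails == K);
            assert(s.tails >= 1 && s.tails <= 4);
        }
        return 0;
    }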
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp
index 4cfb72a..c5976cf 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp
@@ -37,358 +37,375 @@
 // Note that the intent of this is that either ablocks or bblocks will be 1
 // - this construction allows the output loop to proceed in either order.
 
-namespace arm_gemm
-{
-void a32_sgemm_8x6_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K)
-{
+namespace arm_gemm {
+
+void a32_sgemm_8x6_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
     const float *a_ptr = Apanel;
-    float       *c_ptr = Cpanel;
+    float *c_ptr = Cpanel;
 
     /* Work out starting values for "k" and "tails" in the inner loop. */
     int tails_initial = (K & 3);
-    if(tails_initial == 0)
-    {
+    if (tails_initial == 0) {
         tails_initial = 4;
     }
 
-    int k_initial = ((K + 3) / 4) - 1;
+    int k_initial = ((K+3)/4) - 1;
 
-    for(int yb = 0; yb < ablocks; yb++)
-    {
+    for (int yb=0; yb<ablocks; yb++) {
         const float *a_ptr0 = a_ptr;
-        const float *b_ptr  = Bpanel;
+        const float *b_ptr = Bpanel;
 
-        for(int xb = 0; xb < bblocks; xb++)
-        {
+        for (int xb=0; xb<bblocks; xb++) {
             int tails = tails_initial;
-            int k     = k_initial;
+            int k = k_initial;
 
             a_ptr = a_ptr0;
 
-            __asm __volatile(
-                "vldr        d0, [%[a_ptr]]\n"
-                "vmov.i32    q4, #0\n"
-                "vldr        d1, [%[a_ptr], #0x08]\n"
-                "vmov.i32    q5, #0\n"
-                "vldr        d4, [%[b_ptr]]\n"
-                "vmov.i32    q6, #0\n"
-                "vldr        d5, [%[b_ptr], #0x08]\n"
-                "vmov.i32    q7, #0\n"
-                "vldr        d2, [%[a_ptr], #0x10]\n"
-                "vmov.i32    q8, #0\n" ASM_PREFETCH("[%[b_ptr], #0x40]") "vmov.i32    q9, #0\n" ASM_PREFETCH("[%[a_ptr], #0x40]") "vmov.i32    q10, #0\n" ASM_PREFETCH("[%[b_ptr], #0x80]") "vmov.i32    q11, #0\n"
-                ASM_PREFETCH("[%[a_ptr], #0x80]") "vmov.i32    q12, #0\n" ASM_PREFETCH("[%[b_ptr], #0XC0]") "vmov.i32    q13, #0\n" ASM_PREFETCH("[%[a_ptr], #0xC0]") "vmov.i32    q14, #0\n"
-                ASM_PREFETCH("[%[b_ptr], #0x100]") "vmov.i32    q15, #0\n" ASM_PREFETCH("[%[a_ptr], #0x100]") "cmp        %[k], #0\n" ASM_PREFETCH("[%[b_ptr], #0x140]") "beq        6f\n"
+            __asm __volatile (
+                "vldr		d0, [%[a_ptr]]\n"
+                "vmov.i32	q4, #0\n"
+                "vldr		d1, [%[a_ptr], #0x08]\n"
+                "vmov.i32	q5, #0\n"
+                "vldr		d4, [%[b_ptr]]\n"
+                "vmov.i32	q6, #0\n"
+                "vldr		d5, [%[b_ptr], #0x08]\n"
+                "vmov.i32	q7, #0\n"
+                "vldr		d2, [%[a_ptr], #0x10]\n"
+                "vmov.i32	q8, #0\n"
+                ASM_PREFETCH("[%[b_ptr], #0x40]")
+                "vmov.i32	q9, #0\n"
+                ASM_PREFETCH("[%[a_ptr], #0x40]")
+                "vmov.i32	q10, #0\n"
+                ASM_PREFETCH("[%[b_ptr], #0x80]")
+                "vmov.i32	q11, #0\n"
+                ASM_PREFETCH("[%[a_ptr], #0x80]")
+                "vmov.i32	q12, #0\n"
+                ASM_PREFETCH("[%[b_ptr], #0XC0]")
+                "vmov.i32	q13, #0\n"
+                ASM_PREFETCH("[%[a_ptr], #0xC0]")
+                "vmov.i32	q14, #0\n"
+                ASM_PREFETCH("[%[b_ptr], #0x100]")
+                "vmov.i32	q15, #0\n"
+                ASM_PREFETCH("[%[a_ptr], #0x100]")
+                "cmp		%[k], #0\n"
+                ASM_PREFETCH("[%[b_ptr], #0x140]")
+                "beq		6f\n"
                 ASM_PREFETCH("[%[b_ptr], #0x180]")
 
                 "1:\n"
                 // Unroll 0
-                "vmla.f32    q4, q2, d0[0]\n"
-                "vldr        d6, [%[b_ptr], #0x10]\n"
-                "vmla.f32    q5, q2, d0[1]\n"
-                "vldr        d7, [%[b_ptr], #0x18]\n"
-                "vmla.f32    q6, q2, d1[0]\n"
-                "vldr        d3, [%[a_ptr], #0x18]\n"
-                "vmla.f32    q7, q2, d1[1]\n" ASM_PREFETCH("[%[a_ptr], #0x140]")
-                "vmla.f32    q8, q2, d2[0]\n"
-                "subs        %[k], %[k], #1\n"
-                "vmla.f32    q9, q2, d2[1]\n"
-                "vldr        d4, [%[b_ptr], #0x20]\n"
-                "vmla.f32    q10, q3, d0[0]\n"
-                "vldr        d5, [%[b_ptr], #0x28]\n"
-                "vmla.f32    q11, q3, d0[1]\n"
-                "vldr        d0, [%[a_ptr], #0x20]\n"
-                "vmla.f32    q12, q3, d1[0]\n"
+                "vmla.f32	q4, q2, d0[0]\n"
+                "vldr		d6, [%[b_ptr], #0x10]\n"
+                "vmla.f32	q5, q2, d0[1]\n"
+                "vldr		d7, [%[b_ptr], #0x18]\n"
+                "vmla.f32	q6, q2, d1[0]\n"
+                "vldr		d3, [%[a_ptr], #0x18]\n"
+                "vmla.f32	q7, q2, d1[1]\n"
+                ASM_PREFETCH("[%[a_ptr], #0x140]")
+                "vmla.f32	q8, q2, d2[0]\n"
+                "subs		%[k], %[k], #1\n"
+                "vmla.f32	q9, q2, d2[1]\n"
+                "vldr		d4, [%[b_ptr], #0x20]\n"
+                "vmla.f32	q10, q3, d0[0]\n"
+                "vldr		d5, [%[b_ptr], #0x28]\n"
+                "vmla.f32	q11, q3, d0[1]\n"
+                "vldr		d0, [%[a_ptr], #0x20]\n"
+                "vmla.f32	q12, q3, d1[0]\n"
 
-                "vmla.f32    q13, q3, d1[1]\n"
-                "vldr        d1, [%[a_ptr], #0x28]\n"
-                "vmla.f32    q14, q3, d2[0]\n"
+                "vmla.f32	q13, q3, d1[1]\n"
+                "vldr		d1, [%[a_ptr], #0x28]\n"
+                "vmla.f32	q14, q3, d2[0]\n"
 
-                "vmla.f32    q15, q3, d2[1]\n"
-                "vldr        d6, [%[b_ptr], #0x30]\n"
+                "vmla.f32	q15, q3, d2[1]\n"
+                "vldr		d6, [%[b_ptr], #0x30]\n"
 
                 // Unroll 1
-                "vmla.f32    q4, q2, d3[0]\n"
-                "vldr        d7, [%[b_ptr], #0x38]\n"
-                "vmla.f32    q5, q2, d3[1]\n"
-                "vldr        d2, [%[a_ptr], #0x30]\n"
-                "vmla.f32    q6, q2, d0[0]\n"
+                "vmla.f32	q4, q2, d3[0]\n"
+                "vldr		d7, [%[b_ptr], #0x38]\n"
+                "vmla.f32	q5, q2, d3[1]\n"
+                "vldr		d2, [%[a_ptr], #0x30]\n"
+                "vmla.f32	q6, q2, d0[0]\n"
 
-                "vmla.f32    q7, q2, d0[1]\n" ASM_PREFETCH("[%[b_ptr], #0x1C0]")
-                "vmla.f32    q8, q2, d1[0]\n"
+                "vmla.f32	q7, q2, d0[1]\n"
+                ASM_PREFETCH("[%[b_ptr], #0x1C0]")
+                "vmla.f32	q8, q2, d1[0]\n"
 
-                "vmla.f32    q9, q2, d1[1]\n"
-                "vldr        d4, [%[b_ptr], #0x40]\n"
-                "vmla.f32    q10, q3, d3[0]\n"
-                "vldr        d5, [%[b_ptr], #0x48]\n"
-                "vmla.f32    q11, q3, d3[1]\n"
-                "vldr        d3, [%[a_ptr], #0x38]\n"
-                "vmla.f32    q12, q3, d0[0]\n"
+                "vmla.f32	q9, q2, d1[1]\n"
+                "vldr		d4, [%[b_ptr], #0x40]\n"
+                "vmla.f32	q10, q3, d3[0]\n"
+                "vldr		d5, [%[b_ptr], #0x48]\n"
+                "vmla.f32	q11, q3, d3[1]\n"
+                "vldr		d3, [%[a_ptr], #0x38]\n"
+                "vmla.f32	q12, q3, d0[0]\n"
 
-                "vmla.f32    q13, q3, d0[1]\n"
-                "vldr        d0, [%[a_ptr], #0x40]\n"
-                "vmla.f32    q14, q3, d1[0]\n"
+                "vmla.f32	q13, q3, d0[1]\n"
+                "vldr		d0, [%[a_ptr], #0x40]\n"
+                "vmla.f32	q14, q3, d1[0]\n"
 
-                "vmla.f32    q15, q3, d1[1]\n"
-                "vldr        d6, [%[b_ptr], #0x50]\n"
+                "vmla.f32	q15, q3, d1[1]\n"
+                "vldr		d6, [%[b_ptr], #0x50]\n"
 
                 // Unroll 2
-                "vmla.f32    q4, q2, d2[0]\n"
-                "vldr        d7, [%[b_ptr], #0x58]\n"
-                "vmla.f32    q5, q2, d2[1]\n"
-                "vldr        d1, [%[a_ptr], #0x48]\n"
-                "vmla.f32    q6, q2, d3[0]\n"
+                "vmla.f32	q4, q2, d2[0]\n"
+                "vldr		d7, [%[b_ptr], #0x58]\n"
+                "vmla.f32	q5, q2, d2[1]\n"
+                "vldr		d1, [%[a_ptr], #0x48]\n"
+                "vmla.f32	q6, q2, d3[0]\n"
 
-                "vmla.f32    q7, q2, d3[1]\n" ASM_PREFETCH("[%[a_ptr], #0x180]")
-                "vmla.f32    q8, q2, d0[0]\n"
+                "vmla.f32	q7, q2, d3[1]\n"
+                ASM_PREFETCH("[%[a_ptr], #0x180]")
+                "vmla.f32	q8, q2, d0[0]\n"
 
-                "vmla.f32    q9, q2, d0[1]\n"
-                "vldr        d4, [%[b_ptr], #0x60]\n"
-                "vmla.f32    q10, q3, d2[0]\n"
-                "vldr        d5, [%[b_ptr], #0x68]\n"
-                "vmla.f32    q11, q3, d2[1]\n"
-                "vldr        d2, [%[a_ptr], #0x50]\n"
-                "vmla.f32    q12, q3, d3[0]\n"
+                "vmla.f32	q9, q2, d0[1]\n"
+                "vldr		d4, [%[b_ptr], #0x60]\n"
+                "vmla.f32	q10, q3, d2[0]\n"
+                "vldr		d5, [%[b_ptr], #0x68]\n"
+                "vmla.f32	q11, q3, d2[1]\n"
+                "vldr		d2, [%[a_ptr], #0x50]\n"
+                "vmla.f32	q12, q3, d3[0]\n"
 
-                "vmla.f32    q13, q3, d3[1]\n"
-                "vldr        d3, [%[a_ptr], #0x58]\n"
-                "vmla.f32    q14, q3, d0[0]\n"
-                "add        %[a_ptr], %[a_ptr], #0x60\n"
-                "vmla.f32    q15, q3, d0[1]\n"
-                "vldr        d6, [%[b_ptr], #0x70]\n"
+                "vmla.f32	q13, q3, d3[1]\n"
+                "vldr		d3, [%[a_ptr], #0x58]\n"
+                "vmla.f32	q14, q3, d0[0]\n"
+                "add		%[a_ptr], %[a_ptr], #0x60\n"
+                "vmla.f32	q15, q3, d0[1]\n"
+                "vldr		d6, [%[b_ptr], #0x70]\n"
 
                 // Unroll 3
-                "vmla.f32    q4, q2, d1[0]\n"
-                "vldr        d7, [%[b_ptr], #0x78]\n"
-                "vmla.f32    q5, q2, d1[1]\n"
-                "add        %[b_ptr], %[b_ptr], #0x80\n"
-                "vmla.f32    q6, q2, d2[0]\n"
-                "vldr        d0, [%[a_ptr], #0x00]\n"
-                "vmla.f32    q7, q2, d2[1]\n" ASM_PREFETCH("[%[b_ptr], #0x180]")
-                "vmla.f32    q8, q2, d3[0]\n"
+                "vmla.f32	q4, q2, d1[0]\n"
+                "vldr		d7, [%[b_ptr], #0x78]\n"
+                "vmla.f32	q5, q2, d1[1]\n"
+                "add		%[b_ptr], %[b_ptr], #0x80\n"
+                "vmla.f32	q6, q2, d2[0]\n"
+                "vldr		d0, [%[a_ptr], #0x00]\n"
+                "vmla.f32	q7, q2, d2[1]\n"
+                ASM_PREFETCH("[%[b_ptr], #0x180]")
+                "vmla.f32	q8, q2, d3[0]\n"
 
-                "vmla.f32    q9, q2, d3[1]\n"
-                "vldr        d4, [%[b_ptr], #0x00]\n"
-                "vmla.f32    q10, q3, d1[0]\n"
-                "vldr        d5, [%[b_ptr], #0x08]\n"
-                "vmla.f32    q11, q3, d1[1]\n"
-                "vldr        d1, [%[a_ptr], #0x08]\n"
-                "vmla.f32    q12, q3, d2[0]\n"
+                "vmla.f32	q9, q2, d3[1]\n"
+                "vldr		d4, [%[b_ptr], #0x00]\n"
+                "vmla.f32	q10, q3, d1[0]\n"
+                "vldr		d5, [%[b_ptr], #0x08]\n"
+                "vmla.f32	q11, q3, d1[1]\n"
+                "vldr		d1, [%[a_ptr], #0x08]\n"
+                "vmla.f32	q12, q3, d2[0]\n"
 
-                "vmla.f32    q13, q3, d2[1]\n"
-                "vldr        d2, [%[a_ptr], #0x10]\n"
-                "vmla.f32    q14, q3, d3[0]\n"
+                "vmla.f32	q13, q3, d2[1]\n"
+                "vldr		d2, [%[a_ptr], #0x10]\n"
+                "vmla.f32	q14, q3, d3[0]\n"
 
-                "vmla.f32    q15, q3, d3[1]\n"
-                "bne        1b\n"
+                "vmla.f32	q15, q3, d3[1]\n"
+                "bne		1b\n"
 
                 // "Tails" shows how many multiply blocks are needed at the
                 // end, must be 1-4 inclusive.  Bail out to alternative tail
                 // immediately if it's 1.
                 "6:\n"
-                "subs        %[tails], %[tails], #1\n"
-                "beq        3f\n"
+                "subs		%[tails], %[tails], #1\n"
+                "beq		3f\n"
 
                 // Detached final iteration
 
                 // Unroll 0
-                "vmla.f32    q4, q2, d0[0]\n"
-                "vldr        d6, [%[b_ptr], #0x10]\n"
-                "vmla.f32    q5, q2, d0[1]\n"
-                "vldr        d7, [%[b_ptr], #0x18]\n"
-                "vmla.f32    q6, q2, d1[0]\n"
-                "vldr        d3, [%[a_ptr], #0x18]\n"
-                "vmla.f32    q7, q2, d1[1]\n"
-                "subs        %[tails], %[tails], #1\n"
-                "vmla.f32    q8, q2, d2[0]\n"
-                "vmla.f32    q9, q2, d2[1]\n"
-                "vldr        d4, [%[b_ptr], #0x20]\n"
+                "vmla.f32	q4, q2, d0[0]\n"
+                "vldr		d6, [%[b_ptr], #0x10]\n"
+                "vmla.f32	q5, q2, d0[1]\n"
+                "vldr		d7, [%[b_ptr], #0x18]\n"
+                "vmla.f32	q6, q2, d1[0]\n"
+                "vldr		d3, [%[a_ptr], #0x18]\n"
+                "vmla.f32	q7, q2, d1[1]\n"
+                "subs		%[tails], %[tails], #1\n"
+                "vmla.f32	q8, q2, d2[0]\n"
+                "vmla.f32	q9, q2, d2[1]\n"
+                "vldr		d4, [%[b_ptr], #0x20]\n"
 
-                "vmla.f32    q10, q3, d0[0]\n"
-                "vldr        d5, [%[b_ptr], #0x28]\n"
-                "vmla.f32    q11, q3, d0[1]\n"
-                "vldr        d0, [%[a_ptr], #0x20]\n"
-                "vmla.f32    q12, q3, d1[0]\n"
-                "add        %[b_ptr], %[b_ptr], #0x30\n"
-                "vmla.f32    q13, q3, d1[1]\n"
-                "vldr        d1, [%[a_ptr], #0x28]\n"
-                "vmla.f32    q14, q3, d2[0]\n"
-                "vmla.f32    q15, q3, d2[1]\n"
-                "beq        4f\n"
+                "vmla.f32	q10, q3, d0[0]\n"
+                "vldr		d5, [%[b_ptr], #0x28]\n"
+                "vmla.f32	q11, q3, d0[1]\n"
+                "vldr		d0, [%[a_ptr], #0x20]\n"
+                "vmla.f32	q12, q3, d1[0]\n"
+                "add		%[b_ptr], %[b_ptr], #0x30\n"
+                "vmla.f32	q13, q3, d1[1]\n"
+                "vldr		d1, [%[a_ptr], #0x28]\n"
+                "vmla.f32	q14, q3, d2[0]\n"
+                "vmla.f32	q15, q3, d2[1]\n"
+                "beq		4f\n"
 
                 // Unroll 1
-                "vmla.f32    q4, q2, d3[0]\n"
-                "vldr        d6, [%[b_ptr], #0x30]\n"
-                "vmla.f32    q5, q2, d3[1]\n"
-                "vldr        d7, [%[b_ptr], #0x38]\n"
-                "vmla.f32    q6, q2, d0[0]\n"
-                "vldr        d2, [%[a_ptr], #0x30]\n"
-                "vmla.f32    q7, q2, d0[1]\n"
-                "subs        %[tails], %[tails], #1\n"
-                "vmla.f32    q8, q2, d1[0]\n"
+                "vmla.f32	q4, q2, d3[0]\n"
+                "vldr		d6, [%[b_ptr], #0x30]\n"
+                "vmla.f32	q5, q2, d3[1]\n"
+                "vldr		d7, [%[b_ptr], #0x38]\n"
+                "vmla.f32	q6, q2, d0[0]\n"
+                "vldr		d2, [%[a_ptr], #0x30]\n"
+                "vmla.f32	q7, q2, d0[1]\n"
+                "subs		%[tails], %[tails], #1\n"
+                "vmla.f32	q8, q2, d1[0]\n"
 
-                "vmla.f32    q9, q2, d1[1]\n"
+                "vmla.f32	q9, q2, d1[1]\n"
 
-                "vmla.f32    q10, q3, d3[0]\n"
-                "vldr        d4, [%[b_ptr], #0x40]\n"
-                "vmla.f32    q11, q3, d3[1]\n"
-                "vldr        d5, [%[b_ptr], #0x48]\n"
-                "vmla.f32    q12, q3, d0[0]\n"
-                "vldr        d3, [%[a_ptr], #0x38]\n"
-                "vmla.f32    q13, q3, d0[1]\n"
-                "vldr        d0, [%[a_ptr], #0x40]\n"
-                "vmla.f32    q14, q3, d1[0]\n"
-                "vmla.f32    q15, q3, d1[1]\n"
-                "beq        5f\n"
+                "vmla.f32	q10, q3, d3[0]\n"
+                "vldr		d4, [%[b_ptr], #0x40]\n"
+                "vmla.f32	q11, q3, d3[1]\n"
+                "vldr		d5, [%[b_ptr], #0x48]\n"
+                "vmla.f32	q12, q3, d0[0]\n"
+                "vldr		d3, [%[a_ptr], #0x38]\n"
+                "vmla.f32	q13, q3, d0[1]\n"
+                "vldr		d0, [%[a_ptr], #0x40]\n"
+                "vmla.f32	q14, q3, d1[0]\n"
+                "vmla.f32	q15, q3, d1[1]\n"
+                "beq		5f\n"
 
                 // Unroll 2
-                "vmla.f32    q4, q2, d2[0]\n"
-                "vldr        d6, [%[b_ptr], #0x50]\n"
-                "vmla.f32    q5, q2, d2[1]\n"
-                "vldr        d7, [%[b_ptr], #0x58]\n"
-                "vmla.f32    q6, q2, d3[0]\n"
-                "vldr        d1, [%[a_ptr], #0x48]\n"
-                "vmla.f32    q7, q2, d3[1]\n"
-                "vmla.f32    q8, q2, d0[0]\n"
-                "vmla.f32    q9, q2, d0[1]\n"
+                "vmla.f32	q4, q2, d2[0]\n"
+                "vldr		d6, [%[b_ptr], #0x50]\n"
+                "vmla.f32	q5, q2, d2[1]\n"
+                "vldr		d7, [%[b_ptr], #0x58]\n"
+                "vmla.f32	q6, q2, d3[0]\n"
+                "vldr		d1, [%[a_ptr], #0x48]\n"
+                "vmla.f32	q7, q2, d3[1]\n"
+                "vmla.f32	q8, q2, d0[0]\n"
+                "vmla.f32	q9, q2, d0[1]\n"
 
-                "vmla.f32    q10, q3, d2[0]\n"
-                "vldr        d4, [%[b_ptr], #0x60]\n"
-                "vmla.f32    q11, q3, d2[1]\n"
-                "vldr        d5, [%[b_ptr], #0x68]\n"
-                "vmla.f32    q12, q3, d3[0]\n"
-                "vldr        d2, [%[a_ptr], #0x50]\n"
-                "vmla.f32    q13, q3, d3[1]\n"
-                "vldr        d3, [%[a_ptr], #0x58]\n"
-                "vmla.f32    q14, q3, d0[0]\n"
-                "vmla.f32    q15, q3, d0[1]\n"
+                "vmla.f32	q10, q3, d2[0]\n"
+                "vldr		d4, [%[b_ptr], #0x60]\n"
+                "vmla.f32	q11, q3, d2[1]\n"
+                "vldr		d5, [%[b_ptr], #0x68]\n"
+                "vmla.f32	q12, q3, d3[0]\n"
+                "vldr		d2, [%[a_ptr], #0x50]\n"
+                "vmla.f32	q13, q3, d3[1]\n"
+                "vldr		d3, [%[a_ptr], #0x58]\n"
+                "vmla.f32	q14, q3, d0[0]\n"
+                "vmla.f32	q15, q3, d0[1]\n"
 
                 // Unroll 3
-                "vmla.f32    q4, q2, d1[0]\n"
-                "vldr        d6, [%[b_ptr], #0x70]\n"
-                "vmla.f32    q5, q2, d1[1]\n"
-                "vldr        d7, [%[b_ptr], #0x78]\n"
-                "vmla.f32    q10, q3, d1[0]\n"
-                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q11, q3, d1[1]\n"
-                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q6, q2, d2[0]\n"
-                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q12, q3, d2[0]\n"
-                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q7, q2, d2[1]\n"
-                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q13, q3, d2[1]\n"
-                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q8, q2, d3[0]\n"
-                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q14, q3, d3[0]\n"
-                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q9, q2, d3[1]\n"
-                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q15, q3, d3[1]\n"
-                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
-                "add        %[a_ptr], %[a_ptr], #0x60\n"
-                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
-                "add        %[b_ptr], %[b_ptr], #0x80\n"
-                "b        2f\n"
+                "vmla.f32	q4, q2, d1[0]\n"
+                "vldr		d6, [%[b_ptr], #0x70]\n"
+                "vmla.f32	q5, q2, d1[1]\n"
+                "vldr		d7, [%[b_ptr], #0x78]\n"
+                "vmla.f32	q10, q3, d1[0]\n"
+                "vst1.32	{d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q11, q3, d1[1]\n"
+                "vst1.32	{d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q6, q2, d2[0]\n"
+                "vst1.32	{d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q12, q3, d2[0]\n"
+                "vst1.32	{d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q7, q2, d2[1]\n"
+                "vst1.32	{d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q13, q3, d2[1]\n"
+                "vst1.32	{d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q8, q2, d3[0]\n"
+                "vst1.32	{d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q14, q3, d3[0]\n"
+                "vst1.32	{d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q9, q2, d3[1]\n"
+                "vst1.32	{d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q15, q3, d3[1]\n"
+                "vst1.32	{d28-d29}, [%[c_ptr] :128]!\n"
+                "add		%[a_ptr], %[a_ptr], #0x60\n"
+                "vst1.32	{d18-d19}, [%[c_ptr] :128]!\n"
+                "add		%[b_ptr], %[b_ptr], #0x80\n"
+                "b		2f\n"
 
                 // tails==1 final tail
                 "3:\n"
-                "vmla.f32    q4, q2, d0[0]\n"
-                "vldr        d6, [%[b_ptr], #0x10]\n"
-                "vmla.f32    q5, q2, d0[1]\n"
-                "vldr        d7, [%[b_ptr], #0x18]\n"
-                "vmla.f32    q6, q2, d1[0]\n"
-                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q10, q3, d0[0]\n"
-                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q11, q3, d0[1]\n"
-                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q12, q3, d1[0]\n"
-                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q7, q2, d1[1]\n"
-                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q13, q3, d1[1]\n"
-                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q8, q2, d2[0]\n"
-                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q14, q3, d2[0]\n"
-                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q9, q2, d2[1]\n"
-                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q15, q3, d2[1]\n"
-                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
-                "add        %[a_ptr], %[a_ptr], #0x18\n"
-                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
-                "add        %[b_ptr], %[b_ptr], #0x20\n"
-                "b        2f\n"
+                "vmla.f32	q4, q2, d0[0]\n"
+                "vldr		d6, [%[b_ptr], #0x10]\n"
+                "vmla.f32	q5, q2, d0[1]\n"
+                "vldr		d7, [%[b_ptr], #0x18]\n"
+                "vmla.f32	q6, q2, d1[0]\n"
+                "vst1.32	{d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q10, q3, d0[0]\n"
+                "vst1.32	{d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q11, q3, d0[1]\n"
+                "vst1.32	{d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q12, q3, d1[0]\n"
+                "vst1.32	{d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q7, q2, d1[1]\n"
+                "vst1.32	{d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q13, q3, d1[1]\n"
+                "vst1.32	{d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q8, q2, d2[0]\n"
+                "vst1.32	{d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q14, q3, d2[0]\n"
+                "vst1.32	{d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q9, q2, d2[1]\n"
+                "vst1.32	{d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q15, q3, d2[1]\n"
+                "vst1.32	{d28-d29}, [%[c_ptr] :128]!\n"
+                "add		%[a_ptr], %[a_ptr], #0x18\n"
+                "vst1.32	{d18-d19}, [%[c_ptr] :128]!\n"
+                "add		%[b_ptr], %[b_ptr], #0x20\n"
+                "b		2f\n"
 
                 // tails==2 final tail
                 "4:\n"
-                "vmla.f32    q4, q2, d3[0]\n"
-                "vldr        d6, [%[b_ptr], #0x30]\n"
-                "vmla.f32    q5, q2, d3[1]\n"
-                "vldr        d7, [%[b_ptr], #0x38]\n"
-                "vmla.f32    q10, q3, d3[0]\n"
-                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q11, q3, d3[1]\n"
-                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q6, q2, d0[0]\n"
-                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q12, q3, d0[0]\n"
-                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q7, q2, d0[1]\n"
-                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q13, q3, d0[1]\n"
-                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q8, q2, d1[0]\n"
-                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q14, q3, d1[0]\n"
-                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q9, q2, d1[1]\n"
-                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q15, q3, d1[1]\n"
-                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
-                "add        %[b_ptr], %[b_ptr], #0x40\n"
-                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
-                "add        %[a_ptr], %[a_ptr], #0x30\n"
-                "b        2f\n"
+                "vmla.f32	q4, q2, d3[0]\n"
+                "vldr		d6, [%[b_ptr], #0x30]\n"
+                "vmla.f32	q5, q2, d3[1]\n"
+                "vldr		d7, [%[b_ptr], #0x38]\n"
+                "vmla.f32	q10, q3, d3[0]\n"
+                "vst1.32	{d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q11, q3, d3[1]\n"
+                "vst1.32	{d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q6, q2, d0[0]\n"
+                "vst1.32	{d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q12, q3, d0[0]\n"
+                "vst1.32	{d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q7, q2, d0[1]\n"
+                "vst1.32	{d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q13, q3, d0[1]\n"
+                "vst1.32	{d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q8, q2, d1[0]\n"
+                "vst1.32	{d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q14, q3, d1[0]\n"
+                "vst1.32	{d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q9, q2, d1[1]\n"
+                "vst1.32	{d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q15, q3, d1[1]\n"
+                "vst1.32	{d28-d29}, [%[c_ptr] :128]!\n"
+                "add		%[b_ptr], %[b_ptr], #0x40\n"
+                "vst1.32	{d18-d19}, [%[c_ptr] :128]!\n"
+                "add		%[a_ptr], %[a_ptr], #0x30\n"
+                "b		2f\n"
 
                 // tails==3 final tail
                 "5:\n"
-                "vmla.f32    q4, q2, d2[0]\n"
-                "vldr        d6, [%[b_ptr], #0x50]\n"
-                "vmla.f32    q5, q2, d2[1]\n"
-                "vldr        d7, [%[b_ptr], #0x58]\n"
-                "vmla.f32    q6, q2, d3[0]\n"
-                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q10, q3, d2[0]\n"
-                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q11, q3, d2[1]\n"
-                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q12, q3, d3[0]\n"
-                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q7, q2, d3[1]\n"
-                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q13, q3, d3[1]\n"
-                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q8, q2, d0[0]\n"
-                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q14, q3, d0[0]\n"
-                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q9, q2, d0[1]\n"
-                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q15, q3, d0[1]\n"
-                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
-                "add        %[a_ptr], %[a_ptr], #0x48\n"
-                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
-                "add        %[b_ptr], %[b_ptr], #0x60\n"
+                "vmla.f32	q4, q2, d2[0]\n"
+                "vldr		d6, [%[b_ptr], #0x50]\n"
+                "vmla.f32	q5, q2, d2[1]\n"
+                "vldr		d7, [%[b_ptr], #0x58]\n"
+                "vmla.f32	q6, q2, d3[0]\n"
+                "vst1.32	{d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q10, q3, d2[0]\n"
+                "vst1.32	{d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q11, q3, d2[1]\n"
+                "vst1.32	{d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q12, q3, d3[0]\n"
+                "vst1.32	{d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q7, q2, d3[1]\n"
+                "vst1.32	{d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q13, q3, d3[1]\n"
+                "vst1.32	{d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q8, q2, d0[0]\n"
+                "vst1.32	{d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q14, q3, d0[0]\n"
+                "vst1.32	{d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q9, q2, d0[1]\n"
+                "vst1.32	{d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q15, q3, d0[1]\n"
+                "vst1.32	{d28-d29}, [%[c_ptr] :128]!\n"
+                "add		%[a_ptr], %[a_ptr], #0x48\n"
+                "vst1.32	{d18-d19}, [%[c_ptr] :128]!\n"
+                "add		%[b_ptr], %[b_ptr], #0x60\n"
 
                 "2:\n"
-                "vst1.32    {d30-d31}, [%[c_ptr] :128]!\n"
-                : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), [k] "+r"(k), [tails] "+r"(tails)
-                :
-                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0", "r1");
+                "vst1.32	{d30-d31}, [%[c_ptr] :128]!\n"
+            : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), [tails] "+r" (tails)
+            :
+            : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0", "r1"
+            );
         }
     }
 }
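
Both a32_sgemm_8x6 variants split K the same way: tails = K & 3 (with 0 promoted to 4) gives the 1-4 multiply blocks handled by the detached tail, and k = ((K+3)/4) - 1 is the number of passes through the unrolled-by-4 main loop, so 4*k + tails always equals K (for example K=10 gives k=2 and tails=2). A small standalone check of that identity (check_k_split and the driver are illustrative, not part of the library):

    // Sanity check of the k/tails split used by the a32 kernels above
    // (standalone sketch): every K > 0 is covered by 4*k full unrolls
    // plus a detached tail of 1-4 multiplies.
    #include <cassert>

    static void check_k_split(int K) {
        int tails = (K & 3);
        if (tails == 0) {
            tails = 4;              // a multiple of 4 still runs one tail of 4
        }
        int k = ((K + 3) / 4) - 1;  // main-loop iterations

        assert(tails >= 1 && tails <= 4);
        assert(4 * k + tails == K);
    }

    int main() {
        for (int K = 1; K < 64; K++) {
            check_k_split(K);
        }
        return 0;
    }
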
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/generic.cpp
index d7d0484..7b36e8e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/generic.cpp
@@ -37,120 +37,129 @@
 // Note that the intent of this is that either ablocks or bblocks will be 1
 // - this construction allows the output loop to proceed in either order.
 
-namespace arm_gemm
-{
-void a32_sgemm_8x6(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K)
-{
+namespace arm_gemm {
+
+void a32_sgemm_8x6(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
     const float *a_ptr = Apanel;
-    float       *c_ptr = Cpanel;
+    float *c_ptr = Cpanel;
 
-    for(int yb = 0; yb < ablocks; yb++)
-    {
+    for (int yb=0; yb<ablocks; yb++) {
         const float *a_ptr0 = a_ptr;
-        const float *b_ptr  = Bpanel;
+        const float *b_ptr = Bpanel;
 
-        for(int xb = 0; xb < bblocks; xb++)
-        {
-            a_ptr     = a_ptr0;
+        for (int xb=0; xb<bblocks; xb++) {
+            a_ptr = a_ptr0;
             int tails = (K & 3);
-            if(tails == 0)
-            {
+            if (tails == 0) {
                 tails = 4;
             }
-            int k = ((K + 3) / 4) - 1;
+            int k = ((K+3)/4) - 1;
 
-            __asm __volatile(
-                "vmov.i32    q4, #0\n"
-                "vld1.32    {d0-d1}, [%[a_ptr] :64]!\n"
-                "vmov.i32    q5, #0\n"
-                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
-                "vmov.i32    q6, #0\n" ASM_PREFETCH("[%[a_ptr], #48]") "vmov.i32    q7, #0\n" ASM_PREFETCH("[%[b_ptr], #48]") "vmov.i32    q8, #0\n" ASM_PREFETCH("[%[a_ptr], #112]") "vmov.i32    q9, #0\n"
+            __asm __volatile (
+                "vmov.i32	q4, #0\n"
+                "vld1.32	{d0-d1}, [%[a_ptr] :64]!\n"
+                "vmov.i32	q5, #0\n"
+                "vld1.32	{d4-d5}, [%[b_ptr] :128]!\n"
+                "vmov.i32	q6, #0\n"
+                ASM_PREFETCH("[%[a_ptr], #48]")
+                "vmov.i32	q7, #0\n"
+                ASM_PREFETCH("[%[b_ptr], #48]")
+                "vmov.i32	q8, #0\n"
+                ASM_PREFETCH("[%[a_ptr], #112]")
+                "vmov.i32	q9, #0\n"
                 ASM_PREFETCH("[%[b_ptr], #112]")
-                "vmov.i32    q10, #0\n"
-                "vmov.i32    q11, #0\n"
-                "vmov.i32    q12, #0\n"
-                "vmov.i32    q13, #0\n" ASM_PREFETCH("[%[a_ptr], #176]") "vmov.i32    q14, #0\n" ASM_PREFETCH("[%[b_ptr], #176]")
-                "vmov.i32    q15, #0\n"
+                "vmov.i32	q10, #0\n"
+                "vmov.i32	q11, #0\n"
+                "vmov.i32	q12, #0\n"
+                "vmov.i32	q13, #0\n"
+                ASM_PREFETCH("[%[a_ptr], #176]")
+                "vmov.i32	q14, #0\n"
+                ASM_PREFETCH("[%[b_ptr], #176]")
+                "vmov.i32	q15, #0\n"
 
-                "cmp        %[k], #0\n"
-                "beq        6f\n"
+                "cmp		%[k], #0\n"
+                "beq		6f\n"
 
                 "1:\n"
                 // Unroll 0
-                "vmla.f32    q4, q2, d0[0]\n"
-                "vld1.32    {d2-d3}, [%[a_ptr] :64]!\n"
-                "vmla.f32    q5, q2, d0[1]\n"
-                "vmla.f32    q6, q2, d1[0]\n"
-                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
-                "vmla.f32    q7, q2, d1[1]\n"
-                "vmla.f32    q8, q2, d2[0]\n"
-                "vmla.f32    q9, q2, d2[1]\n"
-                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+                "vmla.f32	q4, q2, d0[0]\n"
+                "vld1.32	{d2-d3}, [%[a_ptr] :64]!\n"
+                "vmla.f32	q5, q2, d0[1]\n"
+                "vmla.f32	q6, q2, d1[0]\n"
+                "vld1.32	{d6-d7}, [%[b_ptr] :128]!\n"
+                "vmla.f32	q7, q2, d1[1]\n"
+                "vmla.f32	q8, q2, d2[0]\n"
+                "vmla.f32	q9, q2, d2[1]\n"
+                "vld1.32	{d4-d5}, [%[b_ptr] :128]!\n"
 
-                "vmla.f32    q10, q3, d0[0]\n"
-                "vmla.f32    q11, q3, d0[1]\n"
-                "vmla.f32    q12, q3, d1[0]\n"
-                "vmla.f32    q13, q3, d1[1]\n"
-                "vld1.32    {d0-d1}, [%[a_ptr] :64]!\n"
-                "vmla.f32    q14, q3, d2[0]\n"
-                "vmla.f32    q15, q3, d2[1]\n"
-                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+                "vmla.f32	q10, q3, d0[0]\n"
+                "vmla.f32	q11, q3, d0[1]\n"
+                "vmla.f32	q12, q3, d1[0]\n"
+                "vmla.f32	q13, q3, d1[1]\n"
+                "vld1.32	{d0-d1}, [%[a_ptr] :64]!\n"
+                "vmla.f32	q14, q3, d2[0]\n"
+                "vmla.f32	q15, q3, d2[1]\n"
+                "vld1.32	{d6-d7}, [%[b_ptr] :128]!\n"
 
                 // Unroll 1
-                "vmla.f32    q4, q2, d3[0]\n"
-                "subs        %[k], %[k], #1\n"
-                "vmla.f32    q5, q2, d3[1]\n" ASM_PREFETCH("[%[a_ptr], #208]")
-                "vmla.f32    q6, q2, d0[0]\n"
-                "vmla.f32    q7, q2, d0[1]\n" ASM_PREFETCH("[%[b_ptr], #192]")
-                "vmla.f32    q8, q2, d1[0]\n"
-                "vmla.f32    q9, q2, d1[1]\n"
-                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+                "vmla.f32	q4, q2, d3[0]\n"
+                "subs		%[k], %[k], #1\n"
+                "vmla.f32	q5, q2, d3[1]\n"
+                ASM_PREFETCH("[%[a_ptr], #208]")
+                "vmla.f32	q6, q2, d0[0]\n"
+                "vmla.f32	q7, q2, d0[1]\n"
+                ASM_PREFETCH("[%[b_ptr], #192]")
+                "vmla.f32	q8, q2, d1[0]\n"
+                "vmla.f32	q9, q2, d1[1]\n"
+                "vld1.32	{d4-d5}, [%[b_ptr] :128]!\n"
 
-                "vmla.f32    q10, q3, d3[0]\n"
-                "vmla.f32    q11, q3, d3[1]\n"
-                "vld1.32    {d2-d3}, [%[a_ptr] :64]!\n"
-                "vmla.f32    q12, q3, d0[0]\n"
-                "vmla.f32    q13, q3, d0[1]\n"
-                "vmla.f32    q14, q3, d1[0]\n"
-                "vmla.f32    q15, q3, d1[1]\n"
-                "vld1.32    {d0-d1}, [%[a_ptr] :64]!\n"
+                "vmla.f32	q10, q3, d3[0]\n"
+                "vmla.f32	q11, q3, d3[1]\n"
+                "vld1.32	{d2-d3}, [%[a_ptr] :64]!\n"
+                "vmla.f32	q12, q3, d0[0]\n"
+                "vmla.f32	q13, q3, d0[1]\n"
+                "vmla.f32	q14, q3, d1[0]\n"
+                "vmla.f32	q15, q3, d1[1]\n"
+                "vld1.32	{d0-d1}, [%[a_ptr] :64]!\n"
 
                 // Unroll 2
-                "vmla.f32    q4, q2, d2[0]\n"
-                "vmla.f32    q5, q2, d2[1]\n"
-                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
-                "vmla.f32    q6, q2, d3[0]\n"
-                "vmla.f32    q7, q2, d3[1]\n" ASM_PREFETCH("[%[a_ptr], #240]")
-                "vmla.f32    q8, q2, d0[0]\n"
-                "vmla.f32    q9, q2, d0[1]\n"
-                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+                "vmla.f32	q4, q2, d2[0]\n"
+                "vmla.f32	q5, q2, d2[1]\n"
+                "vld1.32	{d6-d7}, [%[b_ptr] :128]!\n"
+                "vmla.f32	q6, q2, d3[0]\n"
+                "vmla.f32	q7, q2, d3[1]\n"
+                ASM_PREFETCH("[%[a_ptr], #240]")
+                "vmla.f32	q8, q2, d0[0]\n"
+                "vmla.f32	q9, q2, d0[1]\n"
+                "vld1.32	{d4-d5}, [%[b_ptr] :128]!\n"
 
-                "vmla.f32    q10, q3, d2[0]\n"
-                "vmla.f32    q11, q3, d2[1]\n" ASM_PREFETCH("[%[b_ptr], #208]")
-                "vmla.f32    q12, q3, d3[0]\n"
-                "vmla.f32    q13, q3, d3[1]\n"
-                "vld1.32    {d2-d3}, [%[a_ptr] :64]!\n"
-                "vmla.f32    q14, q3, d0[0]\n"
-                "vmla.f32    q15, q3, d0[1]\n"
-                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+                "vmla.f32	q10, q3, d2[0]\n"
+                "vmla.f32	q11, q3, d2[1]\n"
+                ASM_PREFETCH("[%[b_ptr], #208]")
+                "vmla.f32	q12, q3, d3[0]\n"
+                "vmla.f32	q13, q3, d3[1]\n"
+                "vld1.32	{d2-d3}, [%[a_ptr] :64]!\n"
+                "vmla.f32	q14, q3, d0[0]\n"
+                "vmla.f32	q15, q3, d0[1]\n"
+                "vld1.32	{d6-d7}, [%[b_ptr] :128]!\n"
 
                 // Unroll 3
-                "vmla.f32    q4, q2, d1[0]\n"
-                "vmla.f32    q5, q2, d1[1]\n"
-                "vmla.f32    q6, q2, d2[0]\n"
-                "vmla.f32    q7, q2, d2[1]\n"
-                "vmla.f32    q8, q2, d3[0]\n"
-                "vmla.f32    q9, q2, d3[1]\n"
-                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+                "vmla.f32	q4, q2, d1[0]\n"
+                "vmla.f32	q5, q2, d1[1]\n"
+                "vmla.f32	q6, q2, d2[0]\n"
+                "vmla.f32	q7, q2, d2[1]\n"
+                "vmla.f32	q8, q2, d3[0]\n"
+                "vmla.f32	q9, q2, d3[1]\n"
+                "vld1.32	{d4-d5}, [%[b_ptr] :128]!\n"
 
-                "vmla.f32    q10, q3, d1[0]\n"
-                "vmla.f32    q11, q3, d1[1]\n"
-                "vld1.32    {d0-d1}, [%[a_ptr] :64]!\n"
-                "vmla.f32    q12, q3, d2[0]\n"
-                "vmla.f32    q13, q3, d2[1]\n"
-                "vmla.f32    q14, q3, d3[0]\n"
-                "vmla.f32    q15, q3, d3[1]\n"
-                "bne        1b\n"
+                "vmla.f32	q10, q3, d1[0]\n"
+                "vmla.f32	q11, q3, d1[1]\n"
+                "vld1.32	{d0-d1}, [%[a_ptr] :64]!\n"
+                "vmla.f32	q12, q3, d2[0]\n"
+                "vmla.f32	q13, q3, d2[1]\n"
+                "vmla.f32	q14, q3, d3[0]\n"
+                "vmla.f32	q15, q3, d3[1]\n"
+                "bne		1b\n"
 
                 // Branch here if we never execute main loop.
                 "6:\n"
@@ -158,185 +167,186 @@
                 // "Tails" shows how many multiply blocks are needed at the
                 // end, must be 1-4 inclusive.  Bail out to alternative tail
                 // immediately if it's 1.
-                "subs        %[tails], %[tails], #1\n"
-                "beq        3f\n"
+                "subs		%[tails], %[tails], #1\n"
+                "beq		3f\n"
 
                 // Detached final iteration
                 // Unroll 0
-                "vmla.f32    q4, q2, d0[0]\n"
-                "vld1.32    {d2-d3}, [%[a_ptr] :64]!\n"
-                "vmla.f32    q5, q2, d0[1]\n"
-                "vmla.f32    q6, q2, d1[0]\n"
-                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
-                "vmla.f32    q7, q2, d1[1]\n"
-                "vmla.f32    q8, q2, d2[0]\n"
-                "subs        %[tails], %[tails], #1\n"
-                "vmla.f32    q9, q2, d2[1]\n"
-                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+                "vmla.f32	q4, q2, d0[0]\n"
+                "vld1.32	{d2-d3}, [%[a_ptr] :64]!\n"
+                "vmla.f32	q5, q2, d0[1]\n"
+                "vmla.f32	q6, q2, d1[0]\n"
+                "vld1.32	{d6-d7}, [%[b_ptr] :128]!\n"
+                "vmla.f32	q7, q2, d1[1]\n"
+                "vmla.f32	q8, q2, d2[0]\n"
+                "subs		%[tails], %[tails], #1\n"
+                "vmla.f32	q9, q2, d2[1]\n"
+                "vld1.32	{d4-d5}, [%[b_ptr] :128]!\n"
 
-                "vmla.f32    q10, q3, d0[0]\n"
-                "vmla.f32    q11, q3, d0[1]\n"
-                "vmla.f32    q12, q3, d1[0]\n"
-                "vmla.f32    q13, q3, d1[1]\n"
-                "vld1.32    {d0-d1}, [%[a_ptr] :64]!\n"
-                "vmla.f32    q14, q3, d2[0]\n"
-                "vmla.f32    q15, q3, d2[1]\n"
-                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
-                "beq        4f\n"
+                "vmla.f32	q10, q3, d0[0]\n"
+                "vmla.f32	q11, q3, d0[1]\n"
+                "vmla.f32	q12, q3, d1[0]\n"
+                "vmla.f32	q13, q3, d1[1]\n"
+                "vld1.32	{d0-d1}, [%[a_ptr] :64]!\n"
+                "vmla.f32	q14, q3, d2[0]\n"
+                "vmla.f32	q15, q3, d2[1]\n"
+                "vld1.32	{d6-d7}, [%[b_ptr] :128]!\n"
+                "beq		4f\n"
 
                 // Unroll 1
-                "vmla.f32    q4, q2, d3[0]\n"
-                "vmla.f32    q5, q2, d3[1]\n"
-                "subs        %[tails], %[tails], #1\n"
-                "vmla.f32    q6, q2, d0[0]\n"
-                "vmla.f32    q7, q2, d0[1]\n"
-                "vmla.f32    q8, q2, d1[0]\n"
-                "vmla.f32    q9, q2, d1[1]\n"
-                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+                "vmla.f32	q4, q2, d3[0]\n"
+                "vmla.f32	q5, q2, d3[1]\n"
+                "subs		%[tails], %[tails], #1\n"
+                "vmla.f32	q6, q2, d0[0]\n"
+                "vmla.f32	q7, q2, d0[1]\n"
+                "vmla.f32	q8, q2, d1[0]\n"
+                "vmla.f32	q9, q2, d1[1]\n"
+                "vld1.32	{d4-d5}, [%[b_ptr] :128]!\n"
 
-                "vmla.f32    q10, q3, d3[0]\n"
-                "vmla.f32    q11, q3, d3[1]\n"
-                "vld1.32    {d2-d3}, [%[a_ptr] :64]!\n"
-                "vmla.f32    q12, q3, d0[0]\n"
-                "vmla.f32    q13, q3, d0[1]\n"
-                "vmla.f32    q14, q3, d1[0]\n"
-                "vmla.f32    q15, q3, d1[1]\n"
-                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
-                "beq        5f\n"
+                "vmla.f32	q10, q3, d3[0]\n"
+                "vmla.f32	q11, q3, d3[1]\n"
+                "vld1.32	{d2-d3}, [%[a_ptr] :64]!\n"
+                "vmla.f32	q12, q3, d0[0]\n"
+                "vmla.f32	q13, q3, d0[1]\n"
+                "vmla.f32	q14, q3, d1[0]\n"
+                "vmla.f32	q15, q3, d1[1]\n"
+                "vld1.32	{d6-d7}, [%[b_ptr] :128]!\n"
+                "beq		5f\n"
 
                 // Unroll 2
-                "vld1.32    {d0-d1}, [%[a_ptr] :64]!\n"
-                "vmla.f32    q4, q2, d2[0]\n"
-                "vmla.f32    q5, q2, d2[1]\n"
-                "vmla.f32    q6, q2, d3[0]\n"
-                "vmla.f32    q7, q2, d3[1]\n"
-                "vmla.f32    q8, q2, d0[0]\n"
-                "vmla.f32    q9, q2, d0[1]\n"
-                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+                "vld1.32	{d0-d1}, [%[a_ptr] :64]!\n"
+                "vmla.f32	q4, q2, d2[0]\n"
+                "vmla.f32	q5, q2, d2[1]\n"
+                "vmla.f32	q6, q2, d3[0]\n"
+                "vmla.f32	q7, q2, d3[1]\n"
+                "vmla.f32	q8, q2, d0[0]\n"
+                "vmla.f32	q9, q2, d0[1]\n"
+                "vld1.32	{d4-d5}, [%[b_ptr] :128]!\n"
 
-                "vmla.f32    q10, q3, d2[0]\n"
-                "vmla.f32    q11, q3, d2[1]\n"
-                "vmla.f32    q12, q3, d3[0]\n"
-                "vmla.f32    q13, q3, d3[1]\n"
-                "vld1.32    {d2-d3}, [%[a_ptr] :64]!\n"
-                "vmla.f32    q14, q3, d0[0]\n"
-                "vmla.f32    q15, q3, d0[1]\n"
-                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+                "vmla.f32	q10, q3, d2[0]\n"
+                "vmla.f32	q11, q3, d2[1]\n"
+                "vmla.f32	q12, q3, d3[0]\n"
+                "vmla.f32	q13, q3, d3[1]\n"
+                "vld1.32	{d2-d3}, [%[a_ptr] :64]!\n"
+                "vmla.f32	q14, q3, d0[0]\n"
+                "vmla.f32	q15, q3, d0[1]\n"
+                "vld1.32	{d6-d7}, [%[b_ptr] :128]!\n"
 
                 // Unroll 3
-                "vmla.f32    q4, q2, d1[0]\n"
-                "vmla.f32    q10, q3, d1[0]\n"
-                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q5, q2, d1[1]\n"
-                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q11, q3, d1[1]\n"
-                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q6, q2, d2[0]\n"
-                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q12, q3, d2[0]\n"
-                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q7, q2, d2[1]\n"
-                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q13, q3, d2[1]\n"
-                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q8, q2, d3[0]\n"
-                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q14, q3, d3[0]\n"
-                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q9, q2, d3[1]\n"
-                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q15, q3, d3[1]\n"
-                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
-                "b        2f\n"
+                "vmla.f32	q4, q2, d1[0]\n"
+                "vmla.f32	q10, q3, d1[0]\n"
+                "vst1.32	{d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q5, q2, d1[1]\n"
+                "vst1.32	{d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q11, q3, d1[1]\n"
+                "vst1.32	{d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q6, q2, d2[0]\n"
+                "vst1.32	{d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q12, q3, d2[0]\n"
+                "vst1.32	{d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q7, q2, d2[1]\n"
+                "vst1.32	{d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q13, q3, d2[1]\n"
+                "vst1.32	{d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q8, q2, d3[0]\n"
+                "vst1.32	{d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q14, q3, d3[0]\n"
+                "vst1.32	{d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q9, q2, d3[1]\n"
+                "vst1.32	{d28-d29}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q15, q3, d3[1]\n"
+                "vst1.32	{d18-d19}, [%[c_ptr] :128]!\n"
+                "b		2f\n"
 
                 // tails==1 final tail
                 "3:\n"
-                "vmla.f32    q4, q2, d0[0]\n"
-                "vld1.32    {d2}, [%[a_ptr] :64]!\n"
-                "vmla.f32    q5, q2, d0[1]\n"
-                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
-                "vmla.f32    q6, q2, d1[0]\n"
-                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q10, q3, d0[0]\n"
-                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q11, q3, d0[1]\n"
-                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q12, q3, d1[0]\n"
-                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q7, q2, d1[1]\n"
-                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q13, q3, d1[1]\n"
-                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q8, q2, d2[0]\n"
-                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q14, q3, d2[0]\n"
-                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q9, q2, d2[1]\n"
-                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q15, q3, d2[1]\n"
-                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
-                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
-                "b        2f\n"
+                "vmla.f32	q4, q2, d0[0]\n"
+                "vld1.32	{d2}, [%[a_ptr] :64]!\n"
+                "vmla.f32	q5, q2, d0[1]\n"
+                "vld1.32	{d6-d7}, [%[b_ptr] :128]!\n"
+                "vmla.f32	q6, q2, d1[0]\n"
+                "vst1.32	{d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q10, q3, d0[0]\n"
+                "vst1.32	{d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q11, q3, d0[1]\n"
+                "vst1.32	{d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q12, q3, d1[0]\n"
+                "vst1.32	{d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q7, q2, d1[1]\n"
+                "vst1.32	{d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q13, q3, d1[1]\n"
+                "vst1.32	{d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q8, q2, d2[0]\n"
+                "vst1.32	{d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q14, q3, d2[0]\n"
+                "vst1.32	{d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q9, q2, d2[1]\n"
+                "vst1.32	{d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q15, q3, d2[1]\n"
+                "vst1.32	{d28-d29}, [%[c_ptr] :128]!\n"
+                "vst1.32	{d18-d19}, [%[c_ptr] :128]!\n"
+                "b		2f\n"
 
                 // tails==2 final tail
                 "4:\n"
-                "vmla.f32    q4, q2, d3[0]\n"
-                "vmla.f32    q10, q3, d3[0]\n"
-                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q5, q2, d3[1]\n"
-                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q11, q3, d3[1]\n"
-                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q6, q2, d0[0]\n"
-                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q12, q3, d0[0]\n"
-                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q7, q2, d0[1]\n"
-                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q13, q3, d0[1]\n"
-                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q8, q2, d1[0]\n"
-                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q14, q3, d1[0]\n"
-                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q9, q2, d1[1]\n"
-                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q15, q3, d1[1]\n"
-                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
-                "b        2f\n"
+                "vmla.f32	q4, q2, d3[0]\n"
+                "vmla.f32	q10, q3, d3[0]\n"
+                "vst1.32	{d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q5, q2, d3[1]\n"
+                "vst1.32	{d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q11, q3, d3[1]\n"
+                "vst1.32	{d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q6, q2, d0[0]\n"
+                "vst1.32	{d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q12, q3, d0[0]\n"
+                "vst1.32	{d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q7, q2, d0[1]\n"
+                "vst1.32	{d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q13, q3, d0[1]\n"
+                "vst1.32	{d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q8, q2, d1[0]\n"
+                "vst1.32	{d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q14, q3, d1[0]\n"
+                "vst1.32	{d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q9, q2, d1[1]\n"
+                "vst1.32	{d28-d29}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q15, q3, d1[1]\n"
+                "vst1.32	{d18-d19}, [%[c_ptr] :128]!\n"
+                "b		2f\n"
 
                 // tails==3 final tail
                 "5:\n"
-                "vmla.f32    q4, q2, d2[0]\n"
-                "vld1.32    {d0}, [%[a_ptr] :64]!\n"
-                "vmla.f32    q5, q2, d2[1]\n"
-                "vmla.f32    q6, q2, d3[0]\n"
-                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q10, q3, d2[0]\n"
-                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q11, q3, d2[1]\n"
-                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q12, q3, d3[0]\n"
-                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q7, q2, d3[1]\n"
-                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q13, q3, d3[1]\n"
-                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q8, q2, d0[0]\n"
-                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q14, q3, d0[0]\n"
-                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q9, q2, d0[1]\n"
-                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
-                "vmla.f32    q15, q3, d0[1]\n"
-                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
-                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q4, q2, d2[0]\n"
+                "vld1.32	{d0}, [%[a_ptr] :64]!\n"
+                "vmla.f32	q5, q2, d2[1]\n"
+                "vmla.f32	q6, q2, d3[0]\n"
+                "vst1.32	{d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q10, q3, d2[0]\n"
+                "vst1.32	{d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q11, q3, d2[1]\n"
+                "vst1.32	{d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q12, q3, d3[0]\n"
+                "vst1.32	{d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q7, q2, d3[1]\n"
+                "vst1.32	{d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q13, q3, d3[1]\n"
+                "vst1.32	{d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q8, q2, d0[0]\n"
+                "vst1.32	{d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q14, q3, d0[0]\n"
+                "vst1.32	{d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q9, q2, d0[1]\n"
+                "vst1.32	{d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32	q15, q3, d0[1]\n"
+                "vst1.32	{d28-d29}, [%[c_ptr] :128]!\n"
+                "vst1.32	{d18-d19}, [%[c_ptr] :128]!\n"
 
                 "2:\n"
-                "vst1.32    {d30-d31}, [%[c_ptr] :128]!\n"
-                : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), [k] "+r"(k), [tails] "+r"(tails)
-                :
-                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc");
+                "vst1.32	{d30-d31}, [%[c_ptr] :128]!\n"
+            : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), [tails] "+r" (tails)
+            :
+            : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc"
+            );
         }
     }
 }
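
Stripped of the unrolling, prefetching and NEON register allocation, each k step of the 8x6 kernel above multiplies six A values against eight B values and accumulates into a 6x8 output block held in q4-q15. A scalar sketch of that arithmetic, assuming the panels are packed six-per-step for A and eight-per-step for B as the loads suggest (sgemm_8x6_reference is an illustration, not the library's reference path):

    // Scalar sketch of the 8x6 micro-kernel's arithmetic (illustrative only).
    static void sgemm_8x6_reference(const float *a_panel, const float *b_panel,
                                    float *c_panel, int K) {
        float acc[6][8] = {};                    // q4..q15 in the NEON kernel

        for (int k = 0; k < K; k++) {
            const float *a = a_panel + k * 6;    // six A values per k step
            const float *b = b_panel + k * 8;    // eight B values per k step
            for (int row = 0; row < 6; row++) {
                for (int col = 0; col < 8; col++) {
                    acc[row][col] += a[row] * b[col];
                }
            }
        }

        // The assembly writes the block out row by row with the vst1.32
        // sequence; the same order is used here.
        for (int row = 0; row < 6; row++) {
            for (int col = 0; col < 8; col++) {
                *c_panel++ = acc[row][col];
            }
        }
    }
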
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
index 387f899..27700b4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
@@ -25,8 +25,8 @@
 
 #ifdef __aarch64__
 
-namespace arm_gemm
-{
+namespace arm_gemm {
+
 // Actual kernel implementations
 void a64_gemm_s16_asimd_12x8(const int16_t *, const int16_t *, int32_t *, int, int, int);
 
@@ -38,8 +38,7 @@
 // All kernels in the family must share these characteristics.  The actual
 // kernel to be used can be chosen at runtime, based on the CPU_type
 // structure.
-class gemm_s16_12x8
-{
+class gemm_s16_12x8 {
 public:
     typedef int16_t operand_type;
     typedef int32_t result_type;
@@ -48,24 +47,22 @@
 
     /* Describes the data layout for A input */
     static const int A_interleave = 8;
-    static const int A_block      = 1;
-    static const int A_transpose  = 0;
+    static const int A_block = 1;
+    static const int A_transpose = 0;
 
     /* Same for B input */
     static const int B_interleave = 12;
-    static const int B_block      = 1;
-    static const int B_transpose  = 1;
+    static const int B_block = 1;
+    static const int B_transpose = 1;
 
     /* Kernel blocking parameters */
-    static const int out_width  = 12;
+    static const int out_width = 12;
     static const int out_height = 8;
-    static const int k_unroll   = 1;
+    static const int k_unroll = 1;
 
     kern_type kernel = a64_gemm_s16_asimd_12x8;
 
-    gemm_s16_12x8(const CPUInfo *ci)
-    {
-    }
+    gemm_s16_12x8(const CPUInfo *ci) { }
 };
 
 } // namespace arm_gemm
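
gemm_s16_12x8 is a "strategy" descriptor rather than executable code: callers rely only on the operand/result typedefs, the interleave/blocking constants and the kernel function pointer chosen in the constructor, which is what allows a CPU-specific kernel to be substituted at runtime. A hypothetical consumer written against just that interface (run_block is invented for the illustration; only CPUInfo, the typedefs and the members shown in the header come from the source):

    // Hypothetical wrapper that uses only the members a strategy class exposes.
    class CPUInfo;   // the real type lives in the ACL headers

    template <typename Strategy>
    void run_block(const typename Strategy::operand_type *a_panel,
                   const typename Strategy::operand_type *b_panel,
                   typename Strategy::result_type *c_panel,
                   int ablocks, int bblocks, int K,
                   const CPUInfo *ci) {
        Strategy strat(ci);   // the constructor may pick a kernel for this CPU
        // out_width x out_height describes one output block; k_unroll says how
        // many K iterations the kernel consumes per step.
        static_assert(Strategy::out_width > 0 && Strategy::out_height > 0,
                      "blocking parameters must be positive");
        strat.kernel(a_panel, b_panel, c_panel, ablocks, bblocks, K);
    }
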
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp
index b217dcf..823079a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,281 +27,295 @@
 
 #include "../../asmlib.hpp"
 
-namespace arm_gemm
-{
+namespace arm_gemm {
+
 void a64_gemm_s16_asimd_12x8(const int16_t *Apanel, const int16_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K)
 {
-    const int16_t *a_ptr = Apanel;
-    int32_t       *c_ptr = Cpanel;
+  const int16_t *a_ptr = Apanel;
+  int32_t *c_ptr = Cpanel;
 
-    for(int yb = 0; yb < ablocks; yb++)
+  for (int yb = 0; yb < ablocks; yb++)
+  {
+    const int16_t *a_ptr0 = a_ptr;
+    const int16_t *b_ptr = Bpanel;
+
+    for (int xb = 0; xb < bblocks; xb++)
     {
-        const int16_t *a_ptr0 = a_ptr;
-        const int16_t *b_ptr  = Bpanel;
+      a_ptr = a_ptr0;
+      const bool odd_k = K & 0x1;
+      int k = (K+1)/2 - 1;
 
-        for(int xb = 0; xb < bblocks; xb++)
-        {
-            a_ptr            = a_ptr0;
-            const bool odd_k = K & 0x1;
-            int        k     = (K + 1) / 2 - 1;
+      register int16x8_t aa asm("v0");
+      register int16x8_t ab asm("v1");
+      register int16x8_t b0 asm("v2");
+      register int16x8_t b1 asm("v3");
+      register int16x8_t b2 asm("v4");
 
-            register int16x8_t aa asm("v0");
-            register int16x8_t ab asm("v1");
-            register int16x8_t b0 asm("v2");
-            register int16x8_t b1 asm("v3");
-            register int16x8_t b2 asm("v4");
+      __asm __volatile (
+        "ldr %d[aa], [%x[a_ptr]]\n"  // Load A[A].lower
+        "movi v5.4s, #0\n"
+        "ldr x20, [%x[a_ptr], #0x08]\n"  // Load A[A].upper
+        "movi v6.4s, #0\n"
+        "ldr %d[b0], [%x[b_ptr]]\n"  // Load B[0].lower
+        "ins %[aa].d[1], x20\n"  // Merge A[A].lower and upper
+        "movi v7.4s, #0\n"
+        ASM_PREFETCH("[%[a_ptr], #64]")
+        "movi v8.4s, #0\n"
+        "ldr x20, [%x[b_ptr], #0x08]\n"  // Load B[0].upper
+        "movi v9.4s, #0\n"
+        ASM_PREFETCH("[%[b_ptr], #64]")
+        "movi v10.4s, #0\n"
+        "ldr %d[b1], [%x[b_ptr], #0x10]\n"  // Load B[1].lower
+        "ins %[b0].d[1], x20\n"  // Merge B[0].lower and upper
+        "movi v11.4s, #0\n"
+        ASM_PREFETCH("[%[a_ptr], #96]")
+        "movi v12.4s, #0\n"
+        "movi v13.4s, #0\n"
+        ASM_PREFETCH("[%[b_ptr], #96]")
+        "movi v14.4s, #0\n"
+        "movi v15.4s, #0\n"
+        ASM_PREFETCH("[%[a_ptr], #128]")
+        "movi v16.4s, #0\n"
+        "movi v17.4s, #0\n"
+        ASM_PREFETCH("[%[b_ptr], #128]")
+        "movi v18.4s, #0\n"
+        "movi v19.4s, #0\n"
+        ASM_PREFETCH("[%[a_ptr], #160]")
+        "movi v20.4s, #0\n"
+        "movi v21.4s, #0\n"
+        ASM_PREFETCH("[%[b_ptr], #160]")
+        "movi v22.4s, #0\n"
+        "movi v23.4s, #0\n"
+        ASM_PREFETCH("[%[a_ptr], #192]")
+        "movi v24.4s, #0\n"
+        "add %x[a_ptr], %x[a_ptr], #0x10\n"
+        "movi v25.4s, #0\n"
+        ASM_PREFETCH("[%[b_ptr], #192]")
+        "movi v26.4s, #0\n"
+        "add %x[b_ptr], %x[b_ptr], #0x18\n"
+        "movi v27.4s, #0\n"
+        "movi v28.4s, #0\n"
 
-            __asm __volatile(
-                "ldr %d[aa], [%x[a_ptr]]\n" // Load A[A].lower
-                "movi v5.4s, #0\n"
-                "ldr x20, [%x[a_ptr], #0x08]\n" // Load A[A].upper
-                "movi v6.4s, #0\n"
-                "ldr %d[b0], [%x[b_ptr]]\n" // Load B[0].lower
-                "ins %[aa].d[1], x20\n"     // Merge A[A].lower and upper
-                "movi v7.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #64]")
-                "movi v8.4s, #0\n"
-                "ldr x20, [%x[b_ptr], #0x08]\n" // Load B[0].upper
-                "movi v9.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #64]")
-                "movi v10.4s, #0\n"
-                "ldr %d[b1], [%x[b_ptr], #0x10]\n" // Load B[1].lower
-                "ins %[b0].d[1], x20\n"            // Merge B[0].lower and upper
-                "movi v11.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #96]")
-                "movi v12.4s, #0\n"
-                "movi v13.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #96]")
-                "movi v14.4s, #0\n"
-                "movi v15.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #128]")
-                "movi v16.4s, #0\n"
-                "movi v17.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #128]")
-                "movi v18.4s, #0\n"
-                "movi v19.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #160]")
-                "movi v20.4s, #0\n"
-                "movi v21.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #160]")
-                "movi v22.4s, #0\n"
-                "movi v23.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #192]")
-                "movi v24.4s, #0\n"
-                "add %x[a_ptr], %x[a_ptr], #0x10\n"
-                "movi v25.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #192]")
-                "movi v26.4s, #0\n"
-                "add %x[b_ptr], %x[b_ptr], #0x18\n"
-                "movi v27.4s, #0\n"
-                "movi v28.4s, #0\n"
+        "cbz %x[k], 2f\n"  // Skip the loop if doing zero iterations.
 
-                "cbz %x[k], 2f\n" // Skip the loop if doing zero iterations.
+        "1:\n"  // Main loop
+          // First unroll
+          "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+          "ldr x20, [%x[b_ptr]]\n"  // Load B[1].upper
+          "smlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+          "smlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+          "ldr %d[ab], [%x[a_ptr]]\n"  // Load A[B].lower
+          "ins %[b1].d[1], x20\n"  // Merge B[1].lower and .upper
+          "smlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+          "smlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+          "ldr x20, [%x[a_ptr], #0x8]\n"  // Load A[B].upper
+          "smlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+          "smlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+          "ldr %d[b2], [%x[b_ptr], #0x8]\n"  // Load B[2].lower
+          "ins %[ab].d[1], x20\n"  // Merge A[B].lower and .upper
+          "smlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+          "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+          "ldr x20, [%x[b_ptr], #0x10]\n"  // Load B[2].upper
+          "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+          "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+          "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+          "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+          "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+          "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+          "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+          "ldr %d[b0], [%x[b_ptr], #0x18]\n"  // Load B[0].lower
+          "ins %[b2].d[1], x20\n"  // Merge B[2].lower and .upper
+          "smlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+          "smlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+          "ldr x20, [%x[b_ptr], #0x20]\n"  // Load B[0].upper
+          "smlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+          "smlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+          "smlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+          "smlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+          "smlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+          "smlal v28.4s, %[b1].4h, %[aa].h[7]\n"
 
-                "1:\n" // Main loop
-                // First unroll
-                "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
-                "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
-                "smlal v6.4s, %[b0].4h, %[aa].h[1]\n"
-                "smlal v7.4s, %[b0].4h, %[aa].h[2]\n"
-                "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
-                "ins %[b1].d[1], x20\n"     // Merge B[1].lower and .upper
-                "smlal v8.4s, %[b0].4h, %[aa].h[3]\n"
-                "smlal v9.4s, %[b0].4h, %[aa].h[4]\n"
-                "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
-                "smlal v10.4s, %[b0].4h, %[aa].h[5]\n"
-                "smlal v11.4s, %[b0].4h, %[aa].h[6]\n"
-                "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
-                "ins %[ab].d[1], x20\n"           // Merge A[B].lower and .upper
-                "smlal v12.4s, %[b0].4h, %[aa].h[7]\n"
-                "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
-                "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
-                "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
-                "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
-                "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
-                "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
-                "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
-                "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
-                "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
-                "ldr %d[b0], [%x[b_ptr], #0x18]\n" // Load B[0].lower
-                "ins %[b2].d[1], x20\n"            // Merge B[2].lower and .upper
-                "smlal v21.4s, %[b1].4h, %[aa].h[0]\n"
-                "smlal v22.4s, %[b1].4h, %[aa].h[1]\n"
-                "ldr x20, [%x[b_ptr], #0x20]\n" // Load B[0].upper
-                "smlal v23.4s, %[b1].4h, %[aa].h[2]\n"
-                "smlal v24.4s, %[b1].4h, %[aa].h[3]\n"
-                "smlal v25.4s, %[b1].4h, %[aa].h[4]\n"
-                "smlal v26.4s, %[b1].4h, %[aa].h[5]\n"
-                "smlal v27.4s, %[b1].4h, %[aa].h[6]\n"
-                "smlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+          // Second unroll
+          "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
+          "ldr %d[aa], [%x[a_ptr], #0x10]\n"  // Load A[A].lower
+          "ins %[b0].d[1], x20\n"  // Merge B[0].lower and .upper
+          "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
+          "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
+          "ldr x20, [%x[a_ptr], #0x18]\n"  // Load A[A].upper
+          "smlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
+          "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
+          "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
+          "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
+          "add %x[a_ptr], %x[a_ptr], #0x20\n"
+          "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
+          "smlal v13.4s, %[b2].4h, %[ab].h[0]\n"
+          ASM_PREFETCH("[%[b_ptr], #320]")
+          "smlal v14.4s, %[b2].4h, %[ab].h[1]\n"
+          "smlal v15.4s, %[b2].4h, %[ab].h[2]\n"
+          ASM_PREFETCH("[%[a_ptr], #320]")
+          "smlal v16.4s, %[b2].4h, %[ab].h[3]\n"
+          "smlal v17.4s, %[b2].4h, %[ab].h[4]\n"
+          ASM_PREFETCH("[%[b_ptr], #448]")
+          "smlal v18.4s, %[b2].4h, %[ab].h[5]\n"
+          "smlal v19.4s, %[b2].4h, %[ab].h[6]\n"
+          "smlal v20.4s, %[b2].4h, %[ab].h[7]\n"
+          "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
+          "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
+          "subs %x[k], %x[k], #0x1\n"
+          "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
+          "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
+          "ldr %d[b1], [%x[b_ptr], #0x28]\n"  // Load B[1].lower
+          "ins %[aa].d[1], x20\n"  // Merge A[A].lower and .upper
+          "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
+          "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
+          "add %x[b_ptr], %x[b_ptr], #0x30\n"
+          "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
+          "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
+          "bne 1b\n"
 
-                // Second unroll
-                "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
-                "ldr %d[aa], [%x[a_ptr], #0x10]\n" // Load A[A].lower
-                "ins %[b0].d[1], x20\n"            // Merge B[0].lower and .upper
-                "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
-                "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
-                "ldr x20, [%x[a_ptr], #0x18]\n" // Load A[A].upper
-                "smlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
-                "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
-                "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
-                "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
-                "add %x[a_ptr], %x[a_ptr], #0x20\n"
-                "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
-                "smlal v13.4s, %[b2].4h, %[ab].h[0]\n" ASM_PREFETCH("[%[b_ptr], #320]")
-                "smlal v14.4s, %[b2].4h, %[ab].h[1]\n"
-                "smlal v15.4s, %[b2].4h, %[ab].h[2]\n" ASM_PREFETCH("[%[a_ptr], #320]")
-                "smlal v16.4s, %[b2].4h, %[ab].h[3]\n"
-                "smlal v17.4s, %[b2].4h, %[ab].h[4]\n" ASM_PREFETCH("[%[b_ptr], #448]")
-                "smlal v18.4s, %[b2].4h, %[ab].h[5]\n"
-                "smlal v19.4s, %[b2].4h, %[ab].h[6]\n"
-                "smlal v20.4s, %[b2].4h, %[ab].h[7]\n"
-                "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
-                "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
-                "subs %x[k], %x[k], #0x1\n"
-                "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
-                "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
-                "ldr %d[b1], [%x[b_ptr], #0x28]\n" // Load B[1].lower
-                "ins %[aa].d[1], x20\n"            // Merge A[A].lower and .upper
-                "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
-                "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
-                "add %x[b_ptr], %x[b_ptr], #0x30\n"
-                "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
-                "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
-                "bne 1b\n"
+        "2:\n"  // Even tail
+          "cbnz %x[odd_k], 3f\n"
 
-                "2:\n" // Even tail
-                "cbnz %x[odd_k], 3f\n"
+          "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+          "ldr x20, [%x[b_ptr]]\n"  // Load B[1].upper
+          "smlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+          "smlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+          "ldr %d[ab], [%x[a_ptr]]\n"  // Load A[B].lower
+          "ins %[b1].d[1], x20\n"  // Merge B[1].lower and .upper
+          "smlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+          "smlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+          "ldr x20, [%x[a_ptr], #0x8]\n"  // Load A[B].upper
+          "smlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+          "smlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+          "ldr %d[b2], [%x[b_ptr], #0x8]\n"  // Load B[2].lower
+          "ins %[ab].d[1], x20\n"  // Merge A[B].lower and .upper
+          "smlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+          "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+          "ldr x20, [%x[b_ptr], #0x10]\n"  // Load B[2].upper
+          "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+          "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+          "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+          "add %[a_ptr], %[a_ptr], #0x10\n"
+          "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+          "add %[b_ptr], %[b_ptr], #0x18\n"
+          "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+          "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+          "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+          "ins %[b2].d[1], x20\n"  // Merge B[2].lower and .upper
+          "smlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+          "smlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+          "smlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+          "smlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+          "smlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+          "smlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+          "smlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+          "smlal v28.4s, %[b1].4h, %[aa].h[7]\n"
 
-                "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
-                "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
-                "smlal v6.4s, %[b0].4h, %[aa].h[1]\n"
-                "smlal v7.4s, %[b0].4h, %[aa].h[2]\n"
-                "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
-                "ins %[b1].d[1], x20\n"     // Merge B[1].lower and .upper
-                "smlal v8.4s, %[b0].4h, %[aa].h[3]\n"
-                "smlal v9.4s, %[b0].4h, %[aa].h[4]\n"
-                "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
-                "smlal v10.4s, %[b0].4h, %[aa].h[5]\n"
-                "smlal v11.4s, %[b0].4h, %[aa].h[6]\n"
-                "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
-                "ins %[ab].d[1], x20\n"           // Merge A[B].lower and .upper
-                "smlal v12.4s, %[b0].4h, %[aa].h[7]\n"
-                "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
-                "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
-                "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
-                "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
-                "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
-                "add %[a_ptr], %[a_ptr], #0x10\n"
-                "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
-                "add %[b_ptr], %[b_ptr], #0x18\n"
-                "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
-                "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
-                "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
-                "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper
-                "smlal v21.4s, %[b1].4h, %[aa].h[0]\n"
-                "smlal v22.4s, %[b1].4h, %[aa].h[1]\n"
-                "smlal v23.4s, %[b1].4h, %[aa].h[2]\n"
-                "smlal v24.4s, %[b1].4h, %[aa].h[3]\n"
-                "smlal v25.4s, %[b1].4h, %[aa].h[4]\n"
-                "smlal v26.4s, %[b1].4h, %[aa].h[5]\n"
-                "smlal v27.4s, %[b1].4h, %[aa].h[6]\n"
-                "smlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+          "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
+          "smlal v13.4s, %[b2].4h, %[ab].h[0]\n"
+          "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
+          "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
+          "smlal v14.4s, %[b2].4h, %[ab].h[1]\n"
+          "str q5, [%x[c_ptr]]\n"
+          "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
+          "str q13, [%x[c_ptr], #0x10]\n"
+          "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
+          "str q21, [%x[c_ptr], #0x20]\n"
+          "smlal v15.4s, %[b2].4h, %[ab].h[2]\n"
+          "str q6, [%x[c_ptr], #0x30]\n"
+          "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
+          "str q14, [%x[c_ptr], #0x40]\n"
+          "smlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
+          "str q22, [%x[c_ptr], #0x50]\n"
+          "smlal v16.4s, %[b2].4h, %[ab].h[3]\n"
+          "str q7, [%x[c_ptr], #0x60]\n"
+          "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
+          "str q15, [%x[c_ptr], #0x70]\n"
+          "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
+          "str q23, [%x[c_ptr], #0x80]\n"
+          "smlal v17.4s, %[b2].4h, %[ab].h[4]\n"
+          "str q8, [%x[c_ptr], #0x90]\n"
+          "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
+          "str q16, [%x[c_ptr], #0xa0]\n"
+          "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
+          "str q24, [%x[c_ptr], #0xb0]\n"
+          "smlal v18.4s, %[b2].4h, %[ab].h[5]\n"
+          "str q9, [%x[c_ptr], #0xc0]\n"
+          "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
+          "str q17, [%x[c_ptr], #0xd0]\n"
+          "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
+          "str q25, [%x[c_ptr], #0xe0]\n"
+          "smlal v19.4s, %[b2].4h, %[ab].h[6]\n"
+          "str q10, [%x[c_ptr], #0xf0]\n"
+          "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
+          "str q18, [%x[c_ptr], #0x100]\n"
+          "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
+          "str q26, [%x[c_ptr], #0x110]\n"
+          "smlal v20.4s, %[b2].4h, %[ab].h[7]\n"
+          "str q11, [%x[c_ptr], #0x120]\n"
+          "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
+          "str q19, [%x[c_ptr], #0x130]\n"
+          "b 4f\n"  // Complete write out
 
-                "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
-                "smlal v13.4s, %[b2].4h, %[ab].h[0]\n"
-                "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
-                "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
-                "smlal v14.4s, %[b2].4h, %[ab].h[1]\n"
-                "str q5, [%x[c_ptr]]\n"
-                "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
-                "str q13, [%x[c_ptr], #0x10]\n"
-                "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
-                "str q21, [%x[c_ptr], #0x20]\n"
-                "smlal v15.4s, %[b2].4h, %[ab].h[2]\n"
-                "str q6, [%x[c_ptr], #0x30]\n"
-                "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
-                "str q14, [%x[c_ptr], #0x40]\n"
-                "smlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
-                "str q22, [%x[c_ptr], #0x50]\n"
-                "smlal v16.4s, %[b2].4h, %[ab].h[3]\n"
-                "str q7, [%x[c_ptr], #0x60]\n"
-                "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
-                "str q15, [%x[c_ptr], #0x70]\n"
-                "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
-                "str q23, [%x[c_ptr], #0x80]\n"
-                "smlal v17.4s, %[b2].4h, %[ab].h[4]\n"
-                "str q8, [%x[c_ptr], #0x90]\n"
-                "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
-                "str q16, [%x[c_ptr], #0xa0]\n"
-                "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
-                "str q24, [%x[c_ptr], #0xb0]\n"
-                "smlal v18.4s, %[b2].4h, %[ab].h[5]\n"
-                "str q9, [%x[c_ptr], #0xc0]\n"
-                "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
-                "str q17, [%x[c_ptr], #0xd0]\n"
-                "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
-                "str q25, [%x[c_ptr], #0xe0]\n"
-                "smlal v19.4s, %[b2].4h, %[ab].h[6]\n"
-                "str q10, [%x[c_ptr], #0xf0]\n"
-                "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
-                "str q18, [%x[c_ptr], #0x100]\n"
-                "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
-                "str q26, [%x[c_ptr], #0x110]\n"
-                "smlal v20.4s, %[b2].4h, %[ab].h[7]\n"
-                "str q11, [%x[c_ptr], #0x120]\n"
-                "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
-                "str q19, [%x[c_ptr], #0x130]\n"
-                "b 4f\n" // Complete write out
+        "3:\n"  // Odd tail
+          "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+          "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+          "smlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+          "smlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+          "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+          "smlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+          "str q5, [%x[c_ptr]]\n"
+          "smlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+          "str q13, [%x[c_ptr], #0x10]\n"
+          "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+          "str q21, [%x[c_ptr], #0x20]\n"
+          "smlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+          "str q6, [%x[c_ptr], #0x30]\n"
+          "smlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+          "str q14, [%x[c_ptr], #0x40]\n"
+          "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+          "str q22, [%x[c_ptr], #0x50]\n"
+          "smlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+          "str q7, [%x[c_ptr], #0x60]\n"
+          "smlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+          "str q15, [%x[c_ptr], #0x70]\n"
+          "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+          "str q23, [%x[c_ptr], #0x80]\n"
+          "smlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+          "str q8, [%x[c_ptr], #0x90]\n"
+          "smlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+          "str q16, [%x[c_ptr], #0xa0]\n"
+          "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+          "str q24, [%x[c_ptr], #0xb0]\n"
+          "smlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+          "str q9, [%x[c_ptr], #0xc0]\n"
+          "smlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+          "str q17, [%x[c_ptr], #0xd0]\n"
+          "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+          "str q25, [%x[c_ptr], #0xe0]\n"
+          "smlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+          "str q10, [%x[c_ptr], #0xf0]\n"
+          "smlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+          "str q18, [%x[c_ptr], #0x100]\n"
+          "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+          "str q26, [%x[c_ptr], #0x110]\n"
+          "smlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+          "str q11, [%x[c_ptr], #0x120]\n"
 
-                "3:\n" // Odd tail
-                "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
-                "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
-                "smlal v21.4s, %[b1].4h, %[aa].h[0]\n"
-                "smlal v6.4s, %[b0].4h, %[aa].h[1]\n"
-                "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
-                "smlal v22.4s, %[b1].4h, %[aa].h[1]\n"
-                "str q5, [%x[c_ptr]]\n"
-                "smlal v7.4s, %[b0].4h, %[aa].h[2]\n"
-                "str q13, [%x[c_ptr], #0x10]\n"
-                "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
-                "str q21, [%x[c_ptr], #0x20]\n"
-                "smlal v23.4s, %[b1].4h, %[aa].h[2]\n"
-                "str q6, [%x[c_ptr], #0x30]\n"
-                "smlal v8.4s, %[b0].4h, %[aa].h[3]\n"
-                "str q14, [%x[c_ptr], #0x40]\n"
-                "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
-                "str q22, [%x[c_ptr], #0x50]\n"
-                "smlal v24.4s, %[b1].4h, %[aa].h[3]\n"
-                "str q7, [%x[c_ptr], #0x60]\n"
-                "smlal v9.4s, %[b0].4h, %[aa].h[4]\n"
-                "str q15, [%x[c_ptr], #0x70]\n"
-                "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
-                "str q23, [%x[c_ptr], #0x80]\n"
-                "smlal v25.4s, %[b1].4h, %[aa].h[4]\n"
-                "str q8, [%x[c_ptr], #0x90]\n"
-                "smlal v10.4s, %[b0].4h, %[aa].h[5]\n"
-                "str q16, [%x[c_ptr], #0xa0]\n"
-                "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
-                "str q24, [%x[c_ptr], #0xb0]\n"
-                "smlal v26.4s, %[b1].4h, %[aa].h[5]\n"
-                "str q9, [%x[c_ptr], #0xc0]\n"
-                "smlal v11.4s, %[b0].4h, %[aa].h[6]\n"
-                "str q17, [%x[c_ptr], #0xd0]\n"
-                "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
-                "str q25, [%x[c_ptr], #0xe0]\n"
-                "smlal v27.4s, %[b1].4h, %[aa].h[6]\n"
-                "str q10, [%x[c_ptr], #0xf0]\n"
-                "smlal v12.4s, %[b0].4h, %[aa].h[7]\n"
-                "str q18, [%x[c_ptr], #0x100]\n"
-                "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
-                "str q26, [%x[c_ptr], #0x110]\n"
-                "smlal v28.4s, %[b1].4h, %[aa].h[7]\n"
-                "str q11, [%x[c_ptr], #0x120]\n"
-
-                "4:\n" // End of function
-                "str q19, [%x[c_ptr], #0x130]\n"
-                "str q27, [%x[c_ptr], #0x140]\n"
-                "str q12, [%x[c_ptr], #0x150]\n"
-                "str q20, [%x[c_ptr], #0x160]\n"
-                "str q28, [%x[c_ptr], #0x170]\n"
-                "add %x[c_ptr], %x[c_ptr], #0x180\n"
-                : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), [k] "+r"(k),
-                [aa] "+w"(aa), [ab] "+w"(ab), [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2)
-                : [odd_k] "r"(odd_k)
-                : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc");
-        }
+        "4:\n"  // End of function
+          "str q19, [%x[c_ptr], #0x130]\n"
+          "str q27, [%x[c_ptr], #0x140]\n"
+          "str q12, [%x[c_ptr], #0x150]\n"
+          "str q20, [%x[c_ptr], #0x160]\n"
+          "str q28, [%x[c_ptr], #0x170]\n"
+          "add %x[c_ptr], %x[c_ptr], #0x180\n"
+        : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k),
+          [aa] "+w" (aa), [ab] "+w" (ab), [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2)
+        : [odd_k] "r" (odd_k)
+        : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc"
+      );
     }
+  }
 }
 
 } // namespace arm_gemm
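The assembly above interleaves loads, SMLAL/SMLAL2 multiply-accumulates and prefetches, but the arithmetic performed per tile is a plain 8x12 int16 x int16 -> int32 matrix product accumulated over K. The scalar equivalent below is illustrative only (not part of the reverted patch); it assumes the packed layouts described in a64_gemm_s16_12x8.hpp, i.e. 8 contiguous A values and 12 contiguous B values per K step, with 8x12 row-major C tiles written back to back.

#include <cstdint>

// Scalar model of a64_gemm_s16_asimd_12x8 (illustration, not the optimised kernel).
static void gemm_s16_12x8_reference(const int16_t *Apanel, const int16_t *Bpanel,
                                    int32_t *Cpanel, int ablocks, int bblocks, int K) {
    const int16_t *a_ptr = Apanel;
    int32_t *c_ptr = Cpanel;

    for (int yb = 0; yb < ablocks; yb++) {
        const int16_t *a_block = a_ptr;            // 8 rows x K int16, k-major
        const int16_t *b_ptr = Bpanel;

        for (int xb = 0; xb < bblocks; xb++) {     // 12 columns x K int16, k-major
            int32_t acc[8][12] = {};

            for (int k = 0; k < K; k++) {
                for (int r = 0; r < 8; r++) {
                    for (int c = 0; c < 12; c++) {
                        acc[r][c] += int32_t(a_block[k * 8 + r]) * int32_t(b_ptr[k * 12 + c]);
                    }
                }
            }

            for (int r = 0; r < 8; r++) {
                for (int c = 0; c < 12; c++) {
                    *c_ptr++ = acc[r][c];          // one 8x12 tile, row-major
                }
            }

            b_ptr += 12 * K;
        }

        a_ptr = a_block + 8 * K;                   // next block of 8 A rows
    }
}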
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
index 08f90e1..cb97270 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
@@ -27,41 +27,38 @@
 
 #include "arm_gemm.hpp"
 
-namespace arm_gemm
-{
+namespace arm_gemm {
+
 // Load the actual kernel
 void a64_gemm_s8_12x8(const int8_t *, const int8_t *, int32_t *, int, int, int);
 void a64_gemm_s8_12x8_a55r1(const int8_t *, const int8_t *, int32_t *, int, int, int);
 
-class gemm_s8_12x8
-{
+class gemm_s8_12x8 {
 public:
-    typedef int8_t  operand_type;
+    typedef int8_t operand_type;
     typedef int32_t result_type;
 
     typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
 
     /* Describes the data layout for A input */
-    static const int  A_interleave = 8;
-    static const int  A_block      = 4;
-    static const bool A_transpose  = false;
+    static const int A_interleave = 8;
+    static const int A_block = 4;
+    static const bool A_transpose = false;
 
     /* Same for B input */
-    static const int  B_interleave = 12;
-    static const int  B_block      = 4;
-    static const bool B_transpose  = true;
+    static const int B_interleave = 12;
+    static const int B_block = 4;
+    static const bool B_transpose = true;
 
     /* Kernel blocking parameters */
-    static const int out_width  = 12;
+    static const int out_width = 12;
     static const int out_height = 8;
-    static const int k_unroll   = 4;
+    static const int k_unroll = 4;
 
     kern_type kernel = a64_gemm_s8_12x8;
 
-    gemm_s8_12x8(const CPUInfo *ci)
-    {
-        if(ci->get_cpu_model() == CPUModel::A55r1)
-        {
+    gemm_s8_12x8(const CPUInfo *ci) {
+        if (ci->get_cpu_model() == CPUModel::A55r1) {
             kernel = a64_gemm_s8_12x8_a55r1;
         }
     }
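The constructor above is where the runtime dispatch happens: the generic a64_gemm_s8_12x8 entry point is the default, and the Cortex-A55r1 tuned variant is substituted when the CPUInfo query reports that core. The sketch below shows how the selected kernel would then be invoked; it is illustrative only, assumes a64_gemm_s8_12x8.hpp is in scope, and assumes the caller has packed its operands as the A_/B_ parameters describe and padded K to a multiple of k_unroll (4).

#include <cstdint>

// Sketch only: call whichever kernel the strategy object selected at
// construction time (a64_gemm_s8_12x8 or a64_gemm_s8_12x8_a55r1).
static void run_s8_12x8(const arm_gemm::gemm_s8_12x8 &strat,
                        const int8_t *a_panel, const int8_t *b_panel, int32_t *c_panel,
                        int ablocks, int bblocks, int K) {
    strat.kernel(a_panel, b_panel, c_panel, ablocks, bblocks, K);
}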
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp
index ef2f291..eaa7979 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp
@@ -31,40 +31,37 @@
 #include "dot_toolchain_support.h"
 #endif
 
-namespace arm_gemm
-{
-void a64_gemm_s8_12x8_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, const int ablocks, const int bblocks, const int K)
-{
+namespace arm_gemm {
+
+void a64_gemm_s8_12x8_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, const int ablocks, const int bblocks, const int K) {
     const int8_t *a_ptr = Apanel;
-    int32_t      *c_ptr = Cpanel;
+    int32_t *c_ptr = Cpanel;
 
     // We divide K by 4 because the sdot instruction processes 4 elements at a time.
-    const int W = K / 4;
+    const int W = K/4;
 
     // Fix up for odd lengths - set a flag if K is odd, but make
     // sure we round up the iteration count.
-    const int oddk    = (W & 1);
-    const int k_iters = ((W + 1) / 2) - 1;
+    const int oddk = (W & 1);
+    const int k_iters = ((W+1)/2) - 1;
 
-    for(int yb = 0; yb < ablocks; yb++)
-    {
+    for (int yb=0; yb<ablocks; yb++) {
         const int8_t *a_ptr0 = a_ptr;
-        const int8_t *b_ptr  = Bpanel;
+        const int8_t *b_ptr = Bpanel;
 
-        for(int xb = 0; xb < bblocks; xb++)
-        {
+        for (int xb=0; xb<bblocks; xb++) {
             a_ptr = a_ptr0;
             int k = k_iters;
 
-            register int32x4_t a0 asm("v0");
-            register int32x4_t a1 asm("v1");
-            register int32x4_t b0 asm("v2");
-            register int32x4_t b1 asm("v3");
-            register int32x4_t b2 asm("v4");
+            register int32x4_t a0  asm("v0");
+            register int32x4_t a1  asm("v1");
+            register int32x4_t b0  asm("v2");
+            register int32x4_t b1  asm("v3");
+            register int32x4_t b2  asm("v4");
             register int32x4_t a0a asm("v5");
             register int32x4_t a1a asm("v6");
 
-            __asm __volatile(
+            __asm __volatile (
 #ifdef NO_DOT_IN_TOOLCHAIN
                 _DECLARE_SDOT
 #else
@@ -79,22 +76,39 @@
                 "ldr    %q[a1], [%[a_ptr], #16]\n"
                 "movi   v11.4s, #0x0\n"
                 "ldr    %q[b1], [%[b_ptr], #16]\n"
-                "movi   v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi   v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi   v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi   v15.4s, #0x0\n"
-                ASM_PREFETCH("[%[a_ptr], #128]") "movi   v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi   v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi   v12.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi   v13.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi   v14.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi   v15.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #128]")
+                "movi   v16.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi   v17.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #256]")
                 "movi   v18.4s, #0x0\n"
-                "movi   v19.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi   v19.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #192]")
                 "movi   v20.4s, #0x0\n"
-                "movi   v21.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi   v21.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #320]")
                 "movi   v22.4s, #0x0\n"
-                "movi   v23.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]")
+                "movi   v23.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #256]")
                 "movi   v24.4s, #0x0\n"
-                "movi   v25.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #384]")
+                "movi   v25.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #384]")
                 "movi   v26.4s, #0x0\n"
-                "movi   v27.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #448]")
+                "movi   v27.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #448]")
                 "movi   v28.4s, #0x0\n"
-                "movi   v29.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #384]")
+                "movi   v29.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #384]")
                 "movi   v30.4s, #0x0\n"
-                "movi   v31.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #512]")
+                "movi   v31.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #512]")
 
                 // The loop is offset by these two instructions which must
                 // always be executed.
@@ -105,102 +119,105 @@
                 "cbz    %w[k], 4f\n"
 
                 "1:\n"
-                "sdot      v9.4s , %[b0].16b, %[a0].4b[1]\n"
-                "ldr    x20, [%[b_ptr], #40]\n"
-                "sdot    v10.4s, %[b0].16b, %[a0].4b[2]\n"
-                "subs    %w[k], %w[k], #1\n"
-                "sdot    v11.4s, %[b0].16b, %[a0].4b[3]\n"
-                "ldr    %d[a0a], [%[a_ptr], #32]\n"
+                "sdot  	v9.4s , %[b0].16b, %[a0].4b[1]\n"
+                "ldr	x20, [%[b_ptr], #40]\n"
+                "sdot	v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "subs	%w[k], %w[k], #1\n"
+                "sdot	v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "ldr	%d[a0a], [%[a_ptr], #32]\n"
 
-                "sdot     v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "sdot 	v12.4s, %[b0].16b, %[a1].4b[0]\n"
                 "ins    %[b2].d[1], x20\n"
-                "sdot    v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                "sdot	v13.4s, %[b0].16b, %[a1].4b[1]\n"
                 "ldr    x20, [%[a_ptr], #40]\n"
-                "sdot    v14.4s, %[b0].16b, %[a1].4b[2]\n"
-                "sdot    v15.4s, %[b0].16b, %[a1].4b[3]\n"
-                "ldr    %d[a1a], [%[a_ptr], #48]\n"
+                "sdot	v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "sdot	v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "ldr	%d[a1a], [%[a_ptr], #48]\n"
 
-                "sdot    v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "sdot	v16.4s, %[b1].16b, %[a0].4b[0]\n"
                 "ins    %[a0a].d[1], x20\n"
-                "sdot    v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                "sdot	v17.4s, %[b1].16b, %[a0].4b[1]\n"
                 "ldr    x20, [%[a_ptr], #56]\n"
-                "sdot    v18.4s, %[b1].16b, %[a0].4b[2]\n"
-                "sdot    v19.4s, %[b1].16b, %[a0].4b[3]\n"
-                "ldr    %d[b0], [%[b_ptr], #48]\n"
+                "sdot	v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "sdot	v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "ldr	%d[b0], [%[b_ptr], #48]\n"
 
-                "sdot    v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "sdot	v20.4s, %[b1].16b, %[a1].4b[0]\n"
                 "ins    %[a1a].d[1], x20\n"
-                "sdot    v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "sdot	v21.4s, %[b1].16b, %[a1].4b[1]\n"
                 "ldr    x20, [%[b_ptr], #56]\n"
-                "sdot    v22.4s, %[b1].16b, %[a1].4b[2]\n"
-                "sdot    v23.4s, %[b1].16b, %[a1].4b[3]\n"
-                "ldr    %d[b1], [%[b_ptr], #64]\n"
+                "sdot	v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "sdot	v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "ldr	%d[b1], [%[b_ptr], #64]\n"
 
-                "sdot    v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "sdot	v24.4s, %[b2].16b, %[a0].4b[0]\n"
                 "ins    %[b0].d[1], x20\n"
-                "sdot    v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                "sdot	v25.4s, %[b2].16b, %[a0].4b[1]\n"
                 "ldr    x20, [%[b_ptr], #72]\n"
-                "sdot    v26.4s, %[b2].16b, %[a0].4b[2]\n"
-                "sdot    v27.4s, %[b2].16b, %[a0].4b[3]\n" ASM_PREFETCH("[%[a_ptr], #448]")
+                "sdot	v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "sdot	v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                ASM_PREFETCH("[%[a_ptr], #448]")
 
-                "sdot    v28.4s, %[b2].16b, %[a1].4b[0]\n"
-                "sdot    v29.4s, %[b2].16b, %[a1].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #576]")
-                "sdot    v30.4s, %[b2].16b, %[a1].4b[2]\n"
-                "sdot    v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "sdot	v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "sdot	v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                ASM_PREFETCH("[%[b_ptr], #576]")
+                "sdot	v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                "sdot	v31.4s, %[b2].16b, %[a1].4b[3]\n"
 
-                // Unroll 1
-                "ldr    %d[b2], [%[b_ptr], #80]\n"
+		// Unroll 1
+                "ldr	%d[b2], [%[b_ptr], #80]\n"
 
-                "sdot    v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+                "sdot	v8.4s , %[b0].16b, %[a0a].4b[0]\n"
                 "ins    %[b1].d[1], x20\n"
-                "sdot    v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+                "sdot	v9.4s , %[b0].16b, %[a0a].4b[1]\n"
                 "ldr    x20, [%[b_ptr], #88]\n"
-                "sdot    v10.4s, %[b0].16b, %[a0a].4b[2]\n"
-                "sdot    v11.4s, %[b0].16b, %[a0a].4b[3]\n"
-                "ldr    %d[a0], [%[a_ptr], #64]\n"
+                "sdot	v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+                "sdot	v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+                "ldr	%d[a0], [%[a_ptr], #64]\n"
 
-                "sdot     v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+                "sdot 	v12.4s, %[b0].16b, %[a1a].4b[0]\n"
                 "ins    %[b2].d[1], x20\n"
                 "sdot   v13.4s, %[b0].16b, %[a1a].4b[1]\n"
                 "ldr    x20, [%[a_ptr], #72]\n"
-                "sdot    v14.4s, %[b0].16b, %[a1a].4b[2]\n"
-                "sdot    v15.4s, %[b0].16b, %[a1a].4b[3]\n"
-                "ldr    %d[a1], [%[a_ptr], #80]\n"
+                "sdot	v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+                "sdot	v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+                "ldr	%d[a1], [%[a_ptr], #80]\n"
 
-                "sdot    v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+                "sdot	v16.4s, %[b1].16b, %[a0a].4b[0]\n"
                 "ins    %[a0].d[1], x20\n"
-                "sdot    v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+                "sdot	v17.4s, %[b1].16b, %[a0a].4b[1]\n"
                 "ldr    x20, [%[a_ptr], #88]\n"
-                "sdot    v18.4s, %[b1].16b, %[a0a].4b[2]\n"
-                "sdot    v19.4s, %[b1].16b, %[a0a].4b[3]\n"
-                "ldr    %d[b0], [%[b_ptr], #96]\n"
+                "sdot	v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+                "sdot	v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+                "ldr	%d[b0], [%[b_ptr], #96]\n"
 
-                "sdot    v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+                "sdot	v20.4s, %[b1].16b, %[a1a].4b[0]\n"
                 "ins    %[a1].d[1], x20\n"
-                "sdot    v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+                "sdot	v21.4s, %[b1].16b, %[a1a].4b[1]\n"
                 "ldr    x20, [%[b_ptr], #104]\n"
-                "sdot    v22.4s, %[b1].16b, %[a1a].4b[2]\n"
-                "sdot    v23.4s, %[b1].16b, %[a1a].4b[3]\n"
-                "ldr    %d[b1], [%[b_ptr], #112]\n"
+                "sdot	v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+                "sdot	v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+                "ldr	%d[b1], [%[b_ptr], #112]\n"
 
-                "sdot    v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+                "sdot	v24.4s, %[b2].16b, %[a0a].4b[0]\n"
                 "ins    %[b0].d[1], x20\n"
-                "sdot    v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+                "sdot	v25.4s, %[b2].16b, %[a0a].4b[1]\n"
                 "ldr    x20, [%[b_ptr], #120]\n"
-                "sdot    v26.4s, %[b2].16b, %[a0a].4b[2]\n"
-                "sdot    v27.4s, %[b2].16b, %[a0a].4b[3]\n"
-                "add    %[a_ptr], %[a_ptr], #64\n"
+                "sdot	v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+                "sdot	v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+                "add	%[a_ptr], %[a_ptr], #64\n"
 
-                "sdot    v28.4s, %[b2].16b, %[a1a].4b[0]\n" ASM_PREFETCH("[%[b_ptr], #640]")
-                "sdot    v29.4s, %[b2].16b, %[a1a].4b[1]\n"
-                "add    %[b_ptr], %[b_ptr], #96\n"
-                "sdot    v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+                "sdot	v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+                ASM_PREFETCH("[%[b_ptr], #640]")
+                "sdot	v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+                "add	%[b_ptr], %[b_ptr], #96\n"
+                "sdot	v30.4s, %[b2].16b, %[a1a].4b[2]\n"
                 "ins    %[b1].d[1], x20\n"
-                "sdot    v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+                "sdot	v31.4s, %[b2].16b, %[a1a].4b[3]\n"
                 "ldr    %d[b2], [%[b_ptr], #32]\n"
 
                 "sdot   v8.4s , %[b0].16b, %[a0].4b[0]\n"
-                "b.ne    1b\n"
+                "b.ne	1b\n"
 
                 // Branch here if K=1 or 2.  Do the right thing for odd/even at the end.
                 "4:\n"
@@ -212,71 +229,83 @@
                 "cbnz   %w[oddk], 2f\n"
 
                 // Even K continuation
-                "sdot    v11.4s, %[b0].16b, %[a0].4b[3]\n"
-                "ldr    %d[a0a], [%[a_ptr], #32]\n"
+                "sdot	v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "ldr	%d[a0a], [%[a_ptr], #32]\n"
 
-                "sdot     v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "sdot 	v12.4s, %[b0].16b, %[a1].4b[0]\n"
                 "ins    %[b2].d[1], x20\n"
                 "sdot   v13.4s, %[b0].16b, %[a1].4b[1]\n"
                 "ldr    x20, [%[a_ptr], #40]\n"
-                "sdot    v14.4s, %[b0].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr]]")
-                "sdot    v15.4s, %[b0].16b, %[a1].4b[3]\n"
-                "ldr    %d[a1a], [%[a_ptr], #48]\n"
+                "sdot	v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                ASM_PREFETCHW("[%[c_ptr]]")
+                "sdot	v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "ldr	%d[a1a], [%[a_ptr], #48]\n"
 
-                "sdot    v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "sdot	v16.4s, %[b1].16b, %[a0].4b[0]\n"
                 "ins    %[a0a].d[1], x20\n"
-                "sdot    v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                "sdot	v17.4s, %[b1].16b, %[a0].4b[1]\n"
                 "ldr    x20, [%[a_ptr], #56]\n"
-                "sdot    v18.4s, %[b1].16b, %[a0].4b[2]\n"
-                "sdot    v19.4s, %[b1].16b, %[a0].4b[3]\n"
-                "ldr    %d[b0], [%[b_ptr], #48]\n"
+                "sdot	v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "sdot	v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "ldr	%d[b0], [%[b_ptr], #48]\n"
 
-                "sdot    v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "sdot	v20.4s, %[b1].16b, %[a1].4b[0]\n"
                 "ins    %[a1a].d[1], x20\n"
-                "sdot    v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "sdot	v21.4s, %[b1].16b, %[a1].4b[1]\n"
                 "ldr    x20, [%[b_ptr], #56]\n"
-                "sdot    v22.4s, %[b1].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
-                "sdot    v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "sdot	v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                ASM_PREFETCHW("[%[c_ptr], #64]")
+                "sdot	v23.4s, %[b1].16b, %[a1].4b[3]\n"
 
-                "sdot    v24.4s, %[b2].16b, %[a0].4b[0]\n"
-                "sdot    v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
-                "sdot    v26.4s, %[b2].16b, %[a0].4b[2]\n"
-                "sdot    v27.4s, %[b2].16b, %[a0].4b[3]\n"
-                "ldr    %d[b1], [%[b_ptr], #64]\n"
+                "sdot	v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "sdot	v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                ASM_PREFETCHW("[%[c_ptr], #128]")
+                "sdot	v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "sdot	v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "ldr	%d[b1], [%[b_ptr], #64]\n"
 
-                "sdot    v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "sdot	v28.4s, %[b2].16b, %[a1].4b[0]\n"
                 "ins    %[b0].d[1], x20\n"
-                "sdot    v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                "sdot	v29.4s, %[b2].16b, %[a1].4b[1]\n"
                 "ldr    x20, [%[b_ptr], #72]\n"
-                "sdot    v30.4s, %[b2].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
-                "sdot    v31.4s, %[b2].16b, %[a1].4b[3]\n"
-                "ldr    %d[b2], [%[b_ptr], #80]\n"
+                "sdot	v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                ASM_PREFETCHW("[%[c_ptr], #192]")
+                "sdot	v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "ldr	%d[b2], [%[b_ptr], #80]\n"
 
-                "sdot    v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+                "sdot	v8.4s , %[b0].16b, %[a0a].4b[0]\n"
                 "ins    %[b1].d[1], x20\n"
                 "sdot   v9.4s , %[b0].16b, %[a0a].4b[1]\n"
                 "ldr    x20, [%[b_ptr], #88]\n"
-                "sdot    v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+                "sdot	v10.4s, %[b0].16b, %[a0a].4b[2]\n"
                 "ins    %[b2].d[1], x20\n"
 
-                "sdot   v11.4s, %[b0].16b, %[a0a].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+                "sdot   v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+                ASM_PREFETCHW("[%[c_ptr], #256]")
                 "sdot   v12.4s, %[b0].16b, %[a1a].4b[0]\n"
                 "sdot   v13.4s, %[b0].16b, %[a1a].4b[1]\n"
-                "sdot   v14.4s, %[b0].16b, %[a1a].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+                "sdot   v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+                ASM_PREFETCHW("[%[c_ptr], #320]")
                 "sdot   v15.4s, %[b0].16b, %[a1a].4b[3]\n"
-                "sdot   v16.4s, %[b1].16b, %[a0a].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+                "sdot   v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #384]")
                 "sdot   v17.4s, %[b1].16b, %[a0a].4b[1]\n"
-                "sdot   v18.4s, %[b1].16b, %[a0a].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+                "sdot   v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #448]")
                 "sdot   v19.4s, %[b1].16b, %[a0a].4b[3]\n"
                 "sdot   v20.4s, %[b1].16b, %[a1a].4b[0]\n"
-                "sdot   v21.4s, %[b1].16b, %[a1a].4b[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]")
+                "sdot   v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #512]")
                 "sdot   v22.4s, %[b1].16b, %[a1a].4b[2]\n"
-                "sdot   v23.4s, %[b1].16b, %[a1a].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]")
+                "sdot   v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #576]")
                 "sdot   v24.4s, %[b2].16b, %[a0a].4b[0]\n"
                 "sdot   v25.4s, %[b2].16b, %[a0a].4b[1]\n"
-                "sdot   v26.4s, %[b2].16b, %[a0a].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #640]")
+                "sdot   v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #640]")
                 "sdot   v27.4s, %[b2].16b, %[a0a].4b[3]\n"
-                "sdot   v28.4s, %[b2].16b, %[a1a].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+                "sdot   v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #704]")
                 "sdot   v29.4s, %[b2].16b, %[a1a].4b[1]\n"
                 "add    %[a_ptr], %[a_ptr], #64\n"
                 "sdot   v30.4s, %[b2].16b, %[a1a].4b[2]\n"
@@ -286,27 +315,41 @@
 
                 // Odd K continuation
                 "2:\n"
-                "sdot   v11.4s, %[b0].16b, %[a0].4b[3]\n" ASM_PREFETCHW("[%[c_ptr]]")
+                "sdot   v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                ASM_PREFETCHW("[%[c_ptr]]")
                 "sdot   v12.4s, %[b0].16b, %[a1].4b[0]\n"
                 "ins    %[b2].d[1], x20\n"
-                "sdot   v13.4s, %[b0].16b, %[a1].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+                "sdot   v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                ASM_PREFETCHW("[%[c_ptr], #64]")
                 "sdot   v14.4s, %[b0].16b, %[a1].4b[2]\n"
                 "add    %[a_ptr], %[a_ptr], #32\n"
-                "sdot   v15.4s, %[b0].16b, %[a1].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+                "sdot   v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                ASM_PREFETCHW("[%[c_ptr], #128]")
                 "sdot   v16.4s, %[b1].16b, %[a0].4b[0]\n"
                 "add    %[b_ptr], %[b_ptr], #48\n"
-                "sdot   v17.4s, %[b1].16b, %[a0].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+                "sdot   v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                ASM_PREFETCHW("[%[c_ptr], #192]")
                 "sdot   v18.4s, %[b1].16b, %[a0].4b[2]\n"
-                "sdot   v19.4s, %[b1].16b, %[a0].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+                "sdot   v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                ASM_PREFETCHW("[%[c_ptr], #256]")
                 "sdot   v20.4s, %[b1].16b, %[a1].4b[0]\n"
-                "sdot   v21.4s, %[b1].16b, %[a1].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+                "sdot   v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                ASM_PREFETCHW("[%[c_ptr], #320]")
                 "sdot   v22.4s, %[b1].16b, %[a1].4b[2]\n"
-                "sdot   v23.4s, %[b1].16b, %[a1].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+                "sdot   v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #384]")
                 "sdot   v24.4s, %[b2].16b, %[a0].4b[0]\n"
-                "sdot   v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+                "sdot   v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #448]")
                 "sdot   v26.4s, %[b2].16b, %[a0].4b[2]\n"
-                "sdot   v27.4s, %[b2].16b, %[a0].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]") "sdot   v28.4s, %[b2].16b, %[a1].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]") "sdot   v29.4s, %[b2].16b, %[a1].4b[1]\n"
-                ASM_PREFETCHWL2("[%[c_ptr], #640]") "sdot   v30.4s, %[b2].16b, %[a1].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+                "sdot   v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #512]")
+                "sdot   v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #576]")
+                "sdot   v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #640]")
+                "sdot   v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #704]")
                 "sdot   v31.4s, %[b2].16b, %[a1].4b[3]\n"
 
                 // Common tail
@@ -340,13 +383,15 @@
 #ifdef NO_DOT_IN_TOOLCHAIN
                 ".purgem sdot\n"
 #endif
-                :
-                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
-                [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
-                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
-                : [oddk] "r"(oddk)
-                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
-                "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory");
+            :
+              [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+              [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
+              [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
+            : [oddk] "r" (oddk)
+            : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+              "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+            );
+
         }
     }
 }
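Both s8 kernels are built around the dot-product instruction, which is why W = K / 4 above: each "sdot vD.4s, vN.16b, vM.4b[idx]" consumes four int8 values per 32-bit lane and accumulates their dot product into that lane. A scalar model of the by-element form follows, for illustration only.

#include <cstdint>

// Scalar model of SDOT (by element): for each of the four 32-bit lanes of d,
// add the dot product of four int8 values from n with the selected group of
// four int8 values from the other operand.
static void sdot_by_element_model(int32_t d[4], const int8_t n[16], const int8_t m_group[4]) {
    for (int lane = 0; lane < 4; lane++) {
        int32_t sum = 0;
        for (int b = 0; b < 4; b++) {
            sum += int32_t(n[lane * 4 + b]) * int32_t(m_group[b]);
        }
        d[lane] += sum;   // accumulate into the existing lane value
    }
}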
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h
index c76f99d..0bc688d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h
@@ -22,45 +22,47 @@
  * SOFTWARE.
  */
 
+
+
 // Define a macro to assemble the UDOT instruction (in the absence of toolchain support)
-#define _DECLARE_SDOT                                                                                  \
-    ".altmacro\n"                                                                                      \
-    ".macro sdot opd:req, opn:req, opm:req\n"                                                          \
-    "local vd, vn, vm, h, l\n"                                                                         \
-    ".irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n" \
-    ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n"                                                               \
-    ".set vd,\\reg\n"                                                                                  \
-    ".endif\n"                                                                                         \
-    ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n"                                                              \
-    ".set vn,\\reg\n"                                                                                  \
-    ".endif\n"                                                                                         \
-    ".irp idx,0,1,2,3\n"                                                                               \
-    ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n"                                                      \
-    ".set vm,\\reg\n"                                                                                  \
-    ".set h,\\idx / 2\n"                                                                               \
-    ".set l,\\idx %% 2\n"                                                                              \
-    ".endif\n"                                                                                         \
-    ".endr\n"                                                                                          \
-    ".endr\n"                                                                                          \
-    ".ifndef vd\n"                                                                                     \
-    ".error \"Bad operand \\opd\"\n"                                                                   \
-    ".exitm\n"                                                                                         \
-    ".endif\n"                                                                                         \
-    ".ifndef vn\n"                                                                                     \
-    ".error \"Bad operand \\opn\"\n"                                                                   \
-    ".exitm\n"                                                                                         \
-    ".endif\n"                                                                                         \
-    ".ifndef vm\n"                                                                                     \
-    ".error \"Bad operand \\opm\"\n"                                                                   \
-    ".exitm\n"                                                                                         \
-    ".endif\n"                                                                                         \
-    ".ifndef h\n"                                                                                      \
-    ".error \"Bad operand \\opm\"\n"                                                                   \
-    ".exitm\n"                                                                                         \
-    ".endif\n"                                                                                         \
-    ".ifndef l\n"                                                                                      \
-    ".error \"Bad operand \\opm\"\n"                                                                   \
-    ".exitm\n"                                                                                         \
-    ".endif\n"                                                                                         \
-    ".int     0x4f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n"                      \
-    ".endm\n"
+#define _DECLARE_SDOT ".altmacro\n"\
+    ".macro sdot opd:req, opn:req, opm:req\n"\
+    "local vd, vn, vm, h, l\n"\
+    ".irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n"\
+    ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n"\
+    ".set vd,\\reg\n"\
+    ".endif\n"\
+    ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n"\
+    ".set vn,\\reg\n"\
+    ".endif\n"\
+    ".irp idx,0,1,2,3\n"\
+    ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n"\
+    ".set vm,\\reg\n"\
+    ".set h,\\idx / 2\n"\
+    ".set l,\\idx %% 2\n"\
+    ".endif\n"\
+    ".endr\n"\
+    ".endr\n"\
+    ".ifndef vd\n"\
+    ".error \"Bad operand \\opd\"\n"\
+    ".exitm\n"\
+    ".endif\n"\
+    ".ifndef vn\n"\
+    ".error \"Bad operand \\opn\"\n"\
+    ".exitm\n"\
+    ".endif\n"\
+    ".ifndef vm\n"\
+    ".error \"Bad operand \\opm\"\n"\
+    ".exitm\n"\
+    ".endif\n"\
+    ".ifndef h\n"\
+    ".error \"Bad operand \\opm\"\n"\
+    ".exitm\n"\
+    ".endif\n"\
+    ".ifndef l\n"\
+    ".error \"Bad operand \\opm\"\n"\
+    ".exitm\n"\
+    ".endif\n"\
+    ".int	 0x4f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n"\
+    ".endm\n"\
+
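
Aside (editor's note, not part of the patch): the `.int` line at the end of the macro above hand-encodes the by-element SDOT instruction when the assembler has no dot-product support. As a hedged illustration, the same bit-packing can be written as an ordinary C++ helper; the constants and the h/l split of the lane index are taken directly from the macro's final lines, and the helper name `sdot_by_element` is purely hypothetical.

    #include <cstdint>

    // Encodes "sdot vD.4s, vN.16b, vM.4b[idx]" using the same formula as the
    // _DECLARE_SDOT fallback macro: base opcode 0x4f80e000, with the lane index
    // split into a high bit (h) and low bit (l).
    constexpr uint32_t sdot_by_element(unsigned vd, unsigned vn, unsigned vm, unsigned idx) {
        const unsigned h = idx / 2;  // ".set h,\idx / 2"
        const unsigned l = idx % 2;  // ".set l,\idx %% 2"
        return 0x4f80e000u | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11);
    }

    // Consistency check against the macro's formula for vd=8, vn=2, vm=0, idx=0.
    static_assert(sdot_by_element(8, 2, 0, 0) == 0x4f80e048u,
                  "matches the macro's encoding of sdot v8.4s, v2.16b, v0.4b[0]");
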
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp
index 258ef5e..19225dd 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp
@@ -31,309 +31,328 @@
 #include "dot_toolchain_support.h"
 #endif
 
-namespace arm_gemm
-{
-void a64_gemm_s8_12x8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K)
-{
+namespace arm_gemm {
+
+void a64_gemm_s8_12x8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
     const int8_t *a_ptr = Apanel;
-    int32_t      *c_ptr = Cpanel;
+    int32_t *c_ptr = Cpanel;
     // We divide K by 4 because the sdot instruction processes 4 elements at a time.
-    const int W = K / 4;
+    const int W = K/4;
     // Fix up for odd lengths - set a flag if K is odd, but make
     // sure we round up the iteration count.
-    const int oddk         = (W & 1);
-    const int init_value_k = ((W + 1) / 2) - 1;
-    for(int yb = 0; yb < ablocks; yb++)
-    {
+    const int oddk = (W & 1);
+    const int init_value_k = ((W+1)/2) - 1;
+    for (int yb=0; yb<ablocks; yb++) {
         const int8_t *a_ptr0 = a_ptr;
-        const int8_t *b_ptr  = Bpanel;
-        for(int xb = 0; xb < bblocks; xb++)
-        {
-            a_ptr                = a_ptr0;
-            int                k = init_value_k;
-            register int32x4_t a0 asm("v0");
-            register int32x4_t a1 asm("v1");
-            register int32x4_t b0 asm("v2");
-            register int32x4_t b1 asm("v3");
-            register int32x4_t b2 asm("v4");
+        const int8_t *b_ptr = Bpanel;
+        for (int xb=0; xb<bblocks; xb++) {
+            a_ptr = a_ptr0;
+            int k = init_value_k;
+            register int32x4_t a0  asm("v0");
+            register int32x4_t a1  asm("v1");
+            register int32x4_t b0  asm("v2");
+            register int32x4_t b1  asm("v3");
+            register int32x4_t b2  asm("v4");
             register int32x4_t a0a asm("v5");
             register int32x4_t a1a asm("v6");
-            __asm __volatile(
+            __asm __volatile (
 #ifdef NO_DOT_IN_TOOLCHAIN
                 _DECLARE_SDOT
 #else
                 ".arch  armv8.2-a+dotprod\n"
 #endif
                 // Initialize result registers, load initial operands, prime prefetches.
-                "movi    v8.4s, #0x0\n"
-                "ldr    %q[a0], [%[a_ptr]]\n"
-                "movi    v9.4s, #0x0\n"
-                "ldr    %q[b0], [%[b_ptr]]\n"
-                "movi    v10.4s, #0x0\n"
-                "ldr    %q[a1], [%[a_ptr], #16]\n"
-                "movi    v11.4s, #0x0\n"
-                "ldr    %q[b1], [%[b_ptr], #16]\n"
-                "movi    v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi    v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi    v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi    v15.4s, #0x0\n"
-                ASM_PREFETCH("[%[a_ptr], #128]") "movi    v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi    v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") "movi    v18.4s, #0x0\n"
-                ASM_PREFETCH("[%[a_ptr], #192]") "movi    v19.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]") "movi    v20.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]") "movi    v21.4s, #0x0\n"
+                "movi	v8.4s, #0x0\n"
+                "ldr	%q[a0], [%[a_ptr]]\n"
+                "movi	v9.4s, #0x0\n"
+                "ldr	%q[b0], [%[b_ptr]]\n"
+                "movi	v10.4s, #0x0\n"
+                "ldr	%q[a1], [%[a_ptr], #16]\n"
+                "movi	v11.4s, #0x0\n"
+                "ldr	%q[b1], [%[b_ptr], #16]\n"
+                "movi	v12.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi	v13.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi	v14.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi	v15.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #128]")
+                "movi	v16.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi	v17.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi	v18.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi	v19.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi	v20.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #256]")
+                "movi	v21.4s, #0x0\n"
                 ASM_PREFETCH("[%[b_ptr], #384]")
-                "movi    v22.4s, #0x0\n"
-                "movi    v23.4s, #0x0\n"
-                "movi    v24.4s, #0x0\n"
-                "movi    v25.4s, #0x0\n"
-                "movi    v26.4s, #0x0\n"
-                "movi    v27.4s, #0x0\n"
-                "movi    v28.4s, #0x0\n"
-                "movi    v29.4s, #0x0\n"
-                "movi    v30.4s, #0x0\n"
-                "movi    v31.4s, #0x0\n"
+                "movi	v22.4s, #0x0\n"
+                "movi	v23.4s, #0x0\n"
+                "movi	v24.4s, #0x0\n"
+                "movi	v25.4s, #0x0\n"
+                "movi	v26.4s, #0x0\n"
+                "movi	v27.4s, #0x0\n"
+                "movi	v28.4s, #0x0\n"
+                "movi	v29.4s, #0x0\n"
+                "movi	v30.4s, #0x0\n"
+                "movi	v31.4s, #0x0\n"
 
                 // Skip loop if we are doing zero iterations of it.
-                "cbz    %w[k], 4f\n"
+                "cbz	%w[k], 4f\n"
 
                 // Loop proper
                 "1:\n"
-                "sdot    v8.4s , %[b0].16b, %[a0].4b[0]\n"
-                "sdot      v9.4s , %[b0].16b, %[a0].4b[1]\n"
+                "sdot	v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                "sdot  	v9.4s , %[b0].16b, %[a0].4b[1]\n"
 
-                "ldr    %q[b2], [%[b_ptr], #32]\n"
-                "sdot    v10.4s, %[b0].16b, %[a0].4b[2]\n"
-                "sdot    v11.4s, %[b0].16b, %[a0].4b[3]\n"
-                "ldr    %q[a0a], [%[a_ptr], #32]\n"
-                "sdot     v12.4s, %[b0].16b, %[a1].4b[0]\n"
-                "sdot    v13.4s, %[b0].16b, %[a1].4b[1]\n"
-                "ldr    %q[a1a], [%[a_ptr], #48]\n"
-                "sdot    v14.4s, %[b0].16b, %[a1].4b[2]\n"
-                "sdot    v15.4s, %[b0].16b, %[a1].4b[3]\n"
-                "ldr    %q[b0], [%[b_ptr], #48]\n"
+                "ldr	%q[b2], [%[b_ptr], #32]\n"
+                "sdot	v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "sdot	v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "ldr	%q[a0a], [%[a_ptr], #32]\n"
+                "sdot 	v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "sdot	v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                "ldr	%q[a1a], [%[a_ptr], #48]\n"
+                "sdot	v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "sdot	v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "ldr	%q[b0], [%[b_ptr], #48]\n"
 
-                "sdot    v16.4s, %[b1].16b, %[a0].4b[0]\n"
-                "sdot    v17.4s, %[b1].16b, %[a0].4b[1]\n" ASM_PREFETCH("[%[a_ptr], #320]")
-                "sdot    v18.4s, %[b1].16b, %[a0].4b[2]\n"
-                "sdot    v19.4s, %[b1].16b, %[a0].4b[3]\n"
-                "sdot    v20.4s, %[b1].16b, %[a1].4b[0]\n"
-                "sdot    v21.4s, %[b1].16b, %[a1].4b[1]\n"
-                "sdot    v22.4s, %[b1].16b, %[a1].4b[2]\n"
-                "sdot    v23.4s, %[b1].16b, %[a1].4b[3]\n"
-                "ldr    %q[b1], [%[b_ptr], #64]\n"
+                "sdot	v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "sdot	v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                ASM_PREFETCH("[%[a_ptr], #320]")
+                "sdot	v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "sdot	v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "sdot	v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "sdot	v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "sdot	v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "sdot	v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "ldr	%q[b1], [%[b_ptr], #64]\n"
 
-                "sdot    v24.4s, %[b2].16b, %[a0].4b[0]\n"
-                "sdot    v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #448]")
-                "sdot    v26.4s, %[b2].16b, %[a0].4b[2]\n"
-                "sdot    v27.4s, %[b2].16b, %[a0].4b[3]\n"
-                "sdot    v28.4s, %[b2].16b, %[a1].4b[0]\n"
-                "sdot    v29.4s, %[b2].16b, %[a1].4b[1]\n"
-                "sdot    v30.4s, %[b2].16b, %[a1].4b[2]\n"
-                "sdot    v31.4s, %[b2].16b, %[a1].4b[3]\n"
-                "ldr    %q[b2], [%[b_ptr], #80]\n"
+                "sdot	v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "sdot	v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                ASM_PREFETCH("[%[b_ptr], #448]")
+                "sdot	v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "sdot	v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "sdot	v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "sdot	v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                "sdot	v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                "sdot	v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "ldr	%q[b2], [%[b_ptr], #80]\n"
 
-                "sdot    v8.4s , %[b0].16b, %[a0a].4b[0]\n"
-                "sdot    v9.4s , %[b0].16b, %[a0a].4b[1]\n"
-                "ldr    %q[a0], [%[a_ptr], #64]\n"
-                "sdot    v10.4s, %[b0].16b, %[a0a].4b[2]\n"
-                "sdot    v11.4s, %[b0].16b, %[a0a].4b[3]\n"
-                "sdot     v12.4s, %[b0].16b, %[a1a].4b[0]\n"
-                "ldr    %q[a1], [%[a_ptr], #80]\n"
+                "sdot	v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+                "sdot	v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+                "ldr	%q[a0], [%[a_ptr], #64]\n"
+                "sdot	v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+                "sdot	v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+                "sdot 	v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+                "ldr	%q[a1], [%[a_ptr], #80]\n"
                 "sdot   v13.4s, %[b0].16b, %[a1a].4b[1]\n"
-                "sdot    v14.4s, %[b0].16b, %[a1a].4b[2]\n"
-                "sdot    v15.4s, %[b0].16b, %[a1a].4b[3]\n"
-                "ldr    %q[b0], [%[b_ptr], #96]\n"
+                "sdot	v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+                "sdot	v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+                "ldr	%q[b0], [%[b_ptr], #96]\n"
 
-                "sdot    v16.4s, %[b1].16b, %[a0a].4b[0]\n"
-                "sdot    v17.4s, %[b1].16b, %[a0a].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #512]")
-                "sdot    v18.4s, %[b1].16b, %[a0a].4b[2]\n"
-                "sdot    v19.4s, %[b1].16b, %[a0a].4b[3]\n"
-                "sdot    v20.4s, %[b1].16b, %[a1a].4b[0]\n"
-                "sdot    v21.4s, %[b1].16b, %[a1a].4b[1]\n"
-                "sdot    v22.4s, %[b1].16b, %[a1a].4b[2]\n"
-                "sdot    v23.4s, %[b1].16b, %[a1a].4b[3]\n"
-                "ldr    %q[b1], [%[b_ptr], #112]\n"
+                "sdot	v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+                "sdot	v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+                ASM_PREFETCH("[%[b_ptr], #512]")
+                "sdot	v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+                "sdot	v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+                "sdot	v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+                "sdot	v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+                "sdot	v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+                "sdot	v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+                "ldr	%q[b1], [%[b_ptr], #112]\n"
 
-                "sdot    v24.4s, %[b2].16b, %[a0a].4b[0]\n"
-                "sdot    v25.4s, %[b2].16b, %[a0a].4b[1]\n"
-                "add    %[a_ptr], %[a_ptr], #64\n"
-                "sdot    v26.4s, %[b2].16b, %[a0a].4b[2]\n"
-                "sdot    v27.4s, %[b2].16b, %[a0a].4b[3]\n"
-                "add    %[b_ptr], %[b_ptr], #96\n"
-                "sdot    v28.4s, %[b2].16b, %[a1a].4b[0]\n"
-                "sdot    v29.4s, %[b2].16b, %[a1a].4b[1]\n"
-                "subs    %w[k], %w[k], #1\n"
-                "sdot    v30.4s, %[b2].16b, %[a1a].4b[2]\n"
-                "sdot    v31.4s, %[b2].16b, %[a1a].4b[3]\n"
-                "bne    1b\n"
+                "sdot	v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+                "sdot	v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+                "add	%[a_ptr], %[a_ptr], #64\n"
+                "sdot	v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+                "sdot	v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+                "add	%[b_ptr], %[b_ptr], #96\n"
+                "sdot	v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+                "sdot	v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+                "subs	%w[k], %w[k], #1\n"
+                "sdot	v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+                "sdot	v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+                "bne	1b\n"
 
                 // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
                 "4:\n"
 
                 // Branch to alternative tail for odd K
-                "cbnz    %w[oddk], 2f\n"
+                "cbnz	%w[oddk], 2f\n"
 
                 // Detached final iteration (even K)
-                "sdot    v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                "sdot	v8.4s , %[b0].16b, %[a0].4b[0]\n"
                 "sdot   v9.4s , %[b0].16b, %[a0].4b[1]\n"
-                "ldr    %q[b2], [%[b_ptr], #32]\n"
-                "sdot    v10.4s, %[b0].16b, %[a0].4b[2]\n"
-                "sdot    v11.4s, %[b0].16b, %[a0].4b[3]\n"
-                "ldr    %q[a0a], [%[a_ptr], #32]\n"
-                "sdot     v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "ldr	%q[b2], [%[b_ptr], #32]\n"
+                "sdot	v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "sdot	v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "ldr	%q[a0a], [%[a_ptr], #32]\n"
+                "sdot 	v12.4s, %[b0].16b, %[a1].4b[0]\n"
                 "sdot   v13.4s, %[b0].16b, %[a1].4b[1]\n"
-                "ldr    %q[a1a], [%[a_ptr], #48]\n"
-                "sdot    v14.4s, %[b0].16b, %[a1].4b[2]\n"
-                "sdot    v15.4s, %[b0].16b, %[a1].4b[3]\n"
-                "ldr    %q[b0], [%[b_ptr], #48]\n"
+                "ldr	%q[a1a], [%[a_ptr], #48]\n"
+                "sdot	v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "sdot	v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "ldr	%q[b0], [%[b_ptr], #48]\n"
 
-                "sdot    v16.4s, %[b1].16b, %[a0].4b[0]\n"
-                "sdot    v17.4s, %[b1].16b, %[a0].4b[1]\n"
-                "sdot    v18.4s, %[b1].16b, %[a0].4b[2]\n"
-                "sdot    v19.4s, %[b1].16b, %[a0].4b[3]\n"
-                "sdot    v20.4s, %[b1].16b, %[a1].4b[0]\n"
-                "sdot    v21.4s, %[b1].16b, %[a1].4b[1]\n"
-                "sdot    v22.4s, %[b1].16b, %[a1].4b[2]\n"
-                "sdot    v23.4s, %[b1].16b, %[a1].4b[3]\n"
-                "ldr    %q[b1], [%[b_ptr], #64]\n"
+                "sdot	v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "sdot	v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                "sdot	v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "sdot	v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "sdot	v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "sdot	v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "sdot	v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "sdot	v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "ldr	%q[b1], [%[b_ptr], #64]\n"
 
-                "sdot    v24.4s, %[b2].16b, %[a0].4b[0]\n"
-                "sdot    v25.4s, %[b2].16b, %[a0].4b[1]\n"
-                "add    %[a_ptr], %[a_ptr], #64\n"
-                "sdot    v26.4s, %[b2].16b, %[a0].4b[2]\n"
-                "sdot    v27.4s, %[b2].16b, %[a0].4b[3]\n"
-                "sdot    v28.4s, %[b2].16b, %[a1].4b[0]\n"
-                "sdot    v29.4s, %[b2].16b, %[a1].4b[1]\n"
-                "sdot    v30.4s, %[b2].16b, %[a1].4b[2]\n"
-                "sdot    v31.4s, %[b2].16b, %[a1].4b[3]\n"
-                "ldr    %q[b2], [%[b_ptr], #80]\n"
+                "sdot	v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "sdot	v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                "add	%[a_ptr], %[a_ptr], #64\n"
+                "sdot	v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "sdot	v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "sdot	v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "sdot	v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                "sdot	v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                "sdot	v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "ldr	%q[b2], [%[b_ptr], #80]\n"
 
-                "sdot    v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+                "sdot	v8.4s , %[b0].16b, %[a0a].4b[0]\n"
 
-                "sdot    v16.4s, %[b1].16b, %[a0a].4b[0]\n"
-                "add    %[b_ptr], %[b_ptr], #96\n"
+                "sdot	v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+                "add	%[b_ptr], %[b_ptr], #96\n"
                 "sdot   v9.4s , %[b0].16b, %[a0a].4b[1]\n"
-                "str    q8, [%[c_ptr], #0]\n"
-                "sdot    v17.4s, %[b1].16b, %[a0a].4b[1]\n"
-                "str    q16, [%[c_ptr], #16]\n"
-                "sdot    v24.4s, %[b2].16b, %[a0a].4b[0]\n"
-                "str    q24, [%[c_ptr], #32]\n"
+                "str	q8, [%[c_ptr], #0]\n"
+                "sdot	v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+                "str	q16, [%[c_ptr], #16]\n"
+                "sdot	v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+                "str	q24, [%[c_ptr], #32]\n"
 
-                "sdot    v25.4s, %[b2].16b, %[a0a].4b[1]\n"
-                "str    q9, [%[c_ptr], #48]\n"
-                "sdot    v10.4s, %[b0].16b, %[a0a].4b[2]\n"
-                "str    q17, [%[c_ptr], #64]\n"
-                "sdot    v18.4s, %[b1].16b, %[a0a].4b[2]\n"
-                "str    q25, [%[c_ptr], #80]\n"
-                "sdot    v26.4s, %[b2].16b, %[a0a].4b[2]\n"
-                "str    q10, [%[c_ptr], #96]\n"
+                "sdot	v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+                "str	q9, [%[c_ptr], #48]\n"
+                "sdot	v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+                "str	q17, [%[c_ptr], #64]\n"
+                "sdot	v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+                "str	q25, [%[c_ptr], #80]\n"
+                "sdot	v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+                "str	q10, [%[c_ptr], #96]\n"
 
-                "sdot    v11.4s, %[b0].16b, %[a0a].4b[3]\n"
-                "str    q18, [%[c_ptr], #112]\n"
-                "sdot    v19.4s, %[b1].16b, %[a0a].4b[3]\n"
-                "str    q26, [%[c_ptr], #128]\n"
-                "sdot    v27.4s, %[b2].16b, %[a0a].4b[3]\n"
-                "str    q11, [%[c_ptr], #144]\n"
+                "sdot	v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+                "str	q18, [%[c_ptr], #112]\n"
+                "sdot	v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+                "str	q26, [%[c_ptr], #128]\n"
+                "sdot	v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+                "str	q11, [%[c_ptr], #144]\n"
 
-                "sdot     v12.4s, %[b0].16b, %[a1a].4b[0]\n"
-                "str    q19, [%[c_ptr], #160]\n"
-                "sdot    v20.4s, %[b1].16b, %[a1a].4b[0]\n"
-                "str    q27, [%[c_ptr], #176]\n"
-                "sdot    v28.4s, %[b2].16b, %[a1a].4b[0]\n"
-                "str    q12, [%[c_ptr], #192]\n"
+                "sdot 	v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+                "str	q19, [%[c_ptr], #160]\n"
+                "sdot	v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+                "str	q27, [%[c_ptr], #176]\n"
+                "sdot	v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+                "str	q12, [%[c_ptr], #192]\n"
 
                 "sdot   v13.4s, %[b0].16b, %[a1a].4b[1]\n"
-                "str    q20, [%[c_ptr], #208]\n"
-                "sdot    v21.4s, %[b1].16b, %[a1a].4b[1]\n"
-                "str    q28, [%[c_ptr], #224]\n"
-                "sdot    v29.4s, %[b2].16b, %[a1a].4b[1]\n"
-                "str    q13, [%[c_ptr], #240]\n"
+                "str	q20, [%[c_ptr], #208]\n"
+                "sdot	v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+                "str	q28, [%[c_ptr], #224]\n"
+                "sdot	v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+                "str	q13, [%[c_ptr], #240]\n"
 
-                "sdot    v14.4s, %[b0].16b, %[a1a].4b[2]\n"
-                "str    q21, [%[c_ptr], #256]\n"
-                "sdot    v22.4s, %[b1].16b, %[a1a].4b[2]\n"
-                "str    q29, [%[c_ptr], #272]\n"
-                "sdot    v30.4s, %[b2].16b, %[a1a].4b[2]\n"
-                "str    q14, [%[c_ptr], #288]\n"
+                "sdot	v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+                "str	q21, [%[c_ptr], #256]\n"
+                "sdot	v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+                "str	q29, [%[c_ptr], #272]\n"
+                "sdot	v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+                "str	q14, [%[c_ptr], #288]\n"
 
-                "sdot    v15.4s, %[b0].16b, %[a1a].4b[3]\n"
-                "str    q22, [%[c_ptr], #304]\n"
-                "sdot    v23.4s, %[b1].16b, %[a1a].4b[3]\n"
-                "str    q30, [%[c_ptr], #320]\n"
-                "sdot    v31.4s, %[b2].16b, %[a1a].4b[3]\n"
-                "str    q15, [%[c_ptr], #336]\n"
+                "sdot	v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+                "str	q22, [%[c_ptr], #304]\n"
+                "sdot	v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+                "str	q30, [%[c_ptr], #320]\n"
+                "sdot	v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+                "str	q15, [%[c_ptr], #336]\n"
 
-                "b    3f\n"
+                "b	3f\n"
 
                 // Detached final iteration (odd K)
                 "2:\n"
-                "sdot    v8.4s , %[b0].16b, %[a0].4b[0]\n"
-                "ldr    %q[b2], [%[b_ptr], #32]\n"
-                "sdot    v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "sdot	v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                "ldr	%q[b2], [%[b_ptr], #32]\n"
+                "sdot	v16.4s, %[b1].16b, %[a0].4b[0]\n"
                 "sdot   v9.4s , %[b0].16b, %[a0].4b[1]\n"
-                "str    q8, [%[c_ptr], #0]\n"
-                "sdot    v17.4s, %[b1].16b, %[a0].4b[1]\n"
-                "str    q16, [%[c_ptr], #16]\n"
-                "sdot    v24.4s, %[b2].16b, %[a0].4b[0]\n"
-                "add    %[b_ptr], %[b_ptr], #48\n"
-                "add    %[a_ptr], %[a_ptr], #32\n"
-                "str    q24, [%[c_ptr], #32]\n"
-                "sdot    v25.4s, %[b2].16b, %[a0].4b[1]\n"
-                "str    q9, [%[c_ptr], #48]\n"
+                "str	q8, [%[c_ptr], #0]\n"
+                "sdot	v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                "str	q16, [%[c_ptr], #16]\n"
+                "sdot	v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "add	%[b_ptr], %[b_ptr], #48\n"
+                "add	%[a_ptr], %[a_ptr], #32\n"
+                "str	q24, [%[c_ptr], #32]\n"
+                "sdot	v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                "str	q9, [%[c_ptr], #48]\n"
 
-                "sdot    v10.4s, %[b0].16b, %[a0].4b[2]\n"
-                "str    q17, [%[c_ptr], #64]\n"
-                "sdot    v18.4s, %[b1].16b, %[a0].4b[2]\n"
-                "str    q25, [%[c_ptr], #80]\n"
-                "sdot    v26.4s, %[b2].16b, %[a0].4b[2]\n"
-                "str    q10, [%[c_ptr], #96]\n"
+                "sdot	v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "str	q17, [%[c_ptr], #64]\n"
+                "sdot	v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "str	q25, [%[c_ptr], #80]\n"
+                "sdot	v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "str	q10, [%[c_ptr], #96]\n"
 
-                "sdot    v11.4s, %[b0].16b, %[a0].4b[3]\n"
-                "str    q18, [%[c_ptr], #112]\n"
-                "sdot    v19.4s, %[b1].16b, %[a0].4b[3]\n"
-                "str    q26, [%[c_ptr], #128]\n"
-                "sdot    v27.4s, %[b2].16b, %[a0].4b[3]\n"
-                "str    q11, [%[c_ptr], #144]\n"
+                "sdot	v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "str	q18, [%[c_ptr], #112]\n"
+                "sdot	v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "str	q26, [%[c_ptr], #128]\n"
+                "sdot	v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "str	q11, [%[c_ptr], #144]\n"
 
-                "sdot     v12.4s, %[b0].16b, %[a1].4b[0]\n"
-                "str    q19, [%[c_ptr], #160]\n"
-                "sdot    v20.4s, %[b1].16b, %[a1].4b[0]\n"
-                "str    q27, [%[c_ptr], #176]\n"
-                "sdot    v28.4s, %[b2].16b, %[a1].4b[0]\n"
-                "str    q12, [%[c_ptr], #192]\n"
+                "sdot 	v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "str	q19, [%[c_ptr], #160]\n"
+                "sdot	v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "str	q27, [%[c_ptr], #176]\n"
+                "sdot	v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "str	q12, [%[c_ptr], #192]\n"
 
                 "sdot   v13.4s, %[b0].16b, %[a1].4b[1]\n"
-                "str    q20, [%[c_ptr], #208]\n"
-                "sdot    v21.4s, %[b1].16b, %[a1].4b[1]\n"
-                "str    q28, [%[c_ptr], #224]\n"
-                "sdot    v29.4s, %[b2].16b, %[a1].4b[1]\n"
-                "str    q13, [%[c_ptr], #240]\n"
+                "str	q20, [%[c_ptr], #208]\n"
+                "sdot	v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "str	q28, [%[c_ptr], #224]\n"
+                "sdot	v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                "str	q13, [%[c_ptr], #240]\n"
 
-                "sdot    v14.4s, %[b0].16b, %[a1].4b[2]\n"
-                "str    q21, [%[c_ptr], #256]\n"
-                "sdot    v22.4s, %[b1].16b, %[a1].4b[2]\n"
-                "str    q29, [%[c_ptr], #272]\n"
-                "sdot    v30.4s, %[b2].16b, %[a1].4b[2]\n"
-                "str    q14, [%[c_ptr], #288]\n"
+                "sdot	v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "str	q21, [%[c_ptr], #256]\n"
+                "sdot	v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "str	q29, [%[c_ptr], #272]\n"
+                "sdot	v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                "str	q14, [%[c_ptr], #288]\n"
 
-                "sdot    v15.4s, %[b0].16b, %[a1].4b[3]\n"
-                "str    q22, [%[c_ptr], #304]\n"
-                "sdot    v23.4s, %[b1].16b, %[a1].4b[3]\n"
-                "str    q30, [%[c_ptr], #320]\n"
-                "sdot    v31.4s, %[b2].16b, %[a1].4b[3]\n"
-                "str    q15, [%[c_ptr], #336]\n"
+                "sdot	v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "str	q22, [%[c_ptr], #304]\n"
+                "sdot	v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "str	q30, [%[c_ptr], #320]\n"
+                "sdot	v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "str	q15, [%[c_ptr], #336]\n"
+
 
                 // Common tail
                 "3:\n"
-                "str    q23, [%[c_ptr], #352]\n"
-                "str    q31, [%[c_ptr], #368]\n"
-                "add    %[c_ptr], %[c_ptr], #384\n"
+                "str	q23, [%[c_ptr], #352]\n"
+                "str	q31, [%[c_ptr], #368]\n"
+                "add	%[c_ptr], %[c_ptr], #384\n"
 
 #ifdef NO_DOT_IN_TOOLCHAIN
                 ".purgem sdot\n"
 #endif
-                :
-                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
-                [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
-                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
-                : [oddk] "r"(oddk)
-                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
-                "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+            :
+              [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+              [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
+              [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
+            : [oddk] "r" (oddk)
+            : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+              "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
+            );
+
         }
     }
 }
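
Aside (editor's note, not part of the patch): the kernel above divides K by 4 because each sdot step consumes 4 int8 values, and its main loop body covers two such steps, with a detached tail for the remaining one or two. A minimal sketch of that counter arithmetic, using a hypothetical helper name `s8_12x8_loop_counts`:

    struct LoopCounts {
        int main_iters;  // iterations of the "1:" loop; each handles two 4-wide K steps
        int oddk;        // non-zero when a single trailing 4-wide step remains
    };

    inline LoopCounts s8_12x8_loop_counts(int K) {
        const int W = K / 4;                  // sdot processes 4 elements at a time
        return { ((W + 1) / 2) - 1,           // round up to pairs, minus the detached tail
                 W & 1 };
    }

    // Example: K = 24 gives W = 6, so 2 main-loop iterations plus the even-K tail
    // (2 + 1 detached iterations, each covering two steps, accounts for all 6).
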
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
index 2ec28f4..b5b07b2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
@@ -25,43 +25,41 @@
 
 #ifdef __aarch64__
 
-namespace arm_gemm
-{
+namespace arm_gemm {
+
 // Load the actual kernel
 void a64_gemm_s8_4x4(const int8_t *, const int8_t *, int32_t *, int, int, int);
 
 #include "arm_gemm.hpp"
 
-class gemm_s8_4x4
-{
+class gemm_s8_4x4 {
 public:
-    typedef int8_t  operand_type;
+    typedef int8_t operand_type;
     typedef int32_t result_type;
 
     typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
 
     /* Describes the data layout for A input */
-    static const int  A_interleave = 4;
-    static const int  A_block      = 16;
-    static const bool A_transpose  = false;
+    static const int A_interleave = 4;
+    static const int A_block = 16;
+    static const bool A_transpose = false;
 
     /* Same for B input */
-    static const int  B_interleave = 4;
-    static const int  B_block      = 16;
-    static const bool B_transpose  = true;
+    static const int B_interleave = 4;
+    static const int B_block = 16;
+    static const bool B_transpose = true;
 
     /* Kernel blocking parameters */
-    static const int out_width  = 4;
+    static const int out_width = 4;
     static const int out_height = 4;
-    static const int k_unroll   = 16;
+    static const int k_unroll = 16;
 
-    kern_type kernel = a64_gemm_s8_4x4;
+    kern_type kernel=a64_gemm_s8_4x4;
 
-    gemm_s8_4x4(const CPUInfo *ci)
-    {
-    }
+    gemm_s8_4x4(const CPUInfo *ci) { }
 };
 
 } // namespace arm_gemm
 
 #endif // __aarch64__
+
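
Aside (editor's note, an assumption rather than anything stated in the patch): the blocking constants in gemm_s8_4x4 (A_interleave = 4, A_block = 16, k_unroll = 16, and the 4x4 output tile) suggest how packed panel sizes might be derived, assuming the usual convention that K is padded up to a multiple of k_unroll and the row/column counts up to a multiple of the interleave width. The helper names below are hypothetical.

    #include <cstddef>
    #include <cstdint>

    constexpr int roundup(int x, int m) { return ((x + m - 1) / m) * m; }

    // A panel: groups of 4 rows (A_interleave), 16 int8 values per row per block (A_block).
    constexpr std::size_t packed_a_bytes(int M, int K) {
        return std::size_t(roundup(M, 4)) * std::size_t(roundup(K, 16)) * sizeof(int8_t);
    }

    // B panel: groups of 4 columns (B_interleave), blocks of 16 along K (B_block).
    constexpr std::size_t packed_b_bytes(int N, int K) {
        return std::size_t(roundup(N, 4)) * std::size_t(roundup(K, 16)) * sizeof(int8_t);
    }
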
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp
index 243b94e..2fc54f8 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,56 +27,66 @@
 
 #include "../../asmlib.hpp"
 
-namespace arm_gemm
-{
-void a64_gemm_s8_4x4(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K)
-{
+namespace arm_gemm {
+
+void a64_gemm_s8_4x4(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
     const int8_t *a_ptr = Apanel;
-    int32_t      *c_ptr = Cpanel;
+    int32_t *c_ptr = Cpanel;
 
     K /= 16;
     int oddk = (K & 1);
 
-    for(int yb = 0; yb < ablocks; yb++)
-    {
+    for (int yb=0; yb<ablocks; yb++) {
         const int8_t *a_ptr0 = a_ptr;
-        const int8_t *b_ptr  = Bpanel;
+        const int8_t *b_ptr = Bpanel;
 
-        for(int xb = 0; xb < bblocks; xb++)
-        {
+        for (int xb=0; xb<bblocks; xb++) {
             a_ptr = a_ptr0;
 
-            int k = ((K + 1) / 2) - 1;
+            int k = ((K+1)/2)-1;
 
-            register int8x16_t b0 asm("v4");
-            register int8x16_t b1 asm("v5");
-            register int8x16_t b2 asm("v6");
-            register int8x16_t b3 asm("v7");
+            register int8x16_t b0  asm("v4");
+            register int8x16_t b1  asm("v5");
+            register int8x16_t b2  asm("v6");
+            register int8x16_t b3  asm("v7");
             register int8x16_t b0a asm("v8");
             register int8x16_t b1a asm("v9");
             register int8x16_t b2a asm("v10");
             register int8x16_t b3a asm("v11");
 
-            __asm __volatile(
-                "movi    v16.4s, #0x0\n"
-                "ldr    q0, [%[a_ptr]]\n"
-                "movi    v17.4s, #0x0\n"
-                "ldr    %q[b0], [%[b_ptr]]\n"
-                "movi    v18.4s, #0x0\n"
-                "ldr    %q[b1], [%[b_ptr], #16]\n"
-                "movi    v19.4s, #0x0\n"
-                "ldr    %q[b2], [%[b_ptr], #32]\n"
-                "movi    v20.4s, #0x0\n"
-                "ldr    %q[b3], [%[b_ptr], #48]\n"
-                "movi    v21.4s, #0x0\n"
-                "ldr    q1, [%[a_ptr], #16]\n"
-                "movi    v22.4s, #0x0\n"
-                "ldr    q2, [%[a_ptr], #32]\n"
-                "movi    v23.4s, #0x0\n"
-                "ldr    q3, [%[a_ptr], #48]\n"
-                "movi    v24.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi    v25.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi    v26.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi    v27.4s, #0x0\n"
-                ASM_PREFETCH("[%[a_ptr], #128]") "movi    v28.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi    v29.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]") "movi    v30.4s, #0x0\n"
-                ASM_PREFETCH("[%[b_ptr], #256]") "movi    v31.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]")
+            __asm __volatile (
+                "movi	v16.4s, #0x0\n"
+                "ldr	q0, [%[a_ptr]]\n"
+                "movi	v17.4s, #0x0\n"
+                "ldr	%q[b0], [%[b_ptr]]\n"
+                "movi	v18.4s, #0x0\n"
+                "ldr	%q[b1], [%[b_ptr], #16]\n"
+                "movi	v19.4s, #0x0\n"
+                "ldr	%q[b2], [%[b_ptr], #32]\n"
+                "movi	v20.4s, #0x0\n"
+                "ldr	%q[b3], [%[b_ptr], #48]\n"
+                "movi	v21.4s, #0x0\n"
+                "ldr	q1, [%[a_ptr], #16]\n"
+                "movi	v22.4s, #0x0\n"
+                "ldr	q2, [%[a_ptr], #32]\n"
+                "movi	v23.4s, #0x0\n"
+                "ldr	q3, [%[a_ptr], #48]\n"
+                "movi	v24.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi	v25.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi	v26.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi	v27.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #128]")
+                "movi	v28.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi	v29.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi	v30.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi	v31.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #256]")
 
                 // Loop structure optimized for A57 (after r0).
 
@@ -97,356 +107,360 @@
                 // of multiplies that need to be pulled out.
 
                 // Start of unroll 0 (first iteration)
-                "smull    v12.8h, v0.8b, %[b0].8b\n"
-                "smull    v13.8h, v0.8b, %[b1].8b\n"
+                "smull	v12.8h, v0.8b, %[b0].8b\n"
+                "smull	v13.8h, v0.8b, %[b1].8b\n"
 
                 // Skip loop if we are doing zero iterations of it.
-                "cbz    %w[k], 4f\n"
+                "cbz	%w[k], 4f\n"
 
                 // Unroll 0 continuation (branch target)
                 "1:\n"
-                "smull    v14.8h, v0.8b, %[b2].8b\n"
-                "subs    %w[k], %w[k], #1\n"
-                "smull    v15.8h, v0.8b, %[b3].8b\n"
-                "ldr    %q[b0a], [%[b_ptr], #64]\n"
-                "smlal2    v12.8h, v0.16b, %[b0].16b\n"
-                "smlal2    v13.8h, v0.16b, %[b1].16b\n"
-                "ldr    %q[b1a], [%[b_ptr], #80]\n"
-                "smlal2    v14.8h, v0.16b, %[b2].16b\n"
-                "smlal2    v15.8h, v0.16b, %[b3].16b\n"
-                "ldr     q0, [%[a_ptr], #64]\n"
+                "smull	v14.8h, v0.8b, %[b2].8b\n"
+                "subs	%w[k], %w[k], #1\n"
+                "smull	v15.8h, v0.8b, %[b3].8b\n"
+                "ldr	%q[b0a], [%[b_ptr], #64]\n"
+                "smlal2	v12.8h, v0.16b, %[b0].16b\n"
+                "smlal2	v13.8h, v0.16b, %[b1].16b\n"
+                "ldr	%q[b1a], [%[b_ptr], #80]\n"
+                "smlal2	v14.8h, v0.16b, %[b2].16b\n"
+                "smlal2	v15.8h, v0.16b, %[b3].16b\n"
+                "ldr 	q0, [%[a_ptr], #64]\n"
 
-                "sadalp    v16.4s, v12.8h\n"
-                "smull    v12.8h, v1.8b, %[b0].8b\n"
-                "sadalp    v17.4s, v13.8h\n"
-                "sadalp    v18.4s, v14.8h\n"
-                "smull    v13.8h, v1.8b, %[b1].8b\n"
-                "sadalp    v19.4s, v15.8h\n"
-                "smull    v14.8h, v1.8b, %[b2].8b\n"
-                "ldr    %q[b2a], [%[b_ptr], #96]\n"
-                "smull    v15.8h, v1.8b, %[b3].8b\n"
-                "smlal2    v12.8h, v1.16b, %[b0].16b\n"
-                "ldr    %q[b3a], [%[b_ptr], #112]\n"
-                "smlal2    v13.8h, v1.16b, %[b1].16b\n"
-                "add    %[b_ptr], %[b_ptr], #128\n"
-                "smlal2    v14.8h, v1.16b, %[b2].16b\n"
-                "smlal2    v15.8h, v1.16b, %[b3].16b\n"
-                "ldr     q1, [%[a_ptr], #80]\n"
+                "sadalp	v16.4s, v12.8h\n"
+                "smull	v12.8h, v1.8b, %[b0].8b\n"
+                "sadalp	v17.4s, v13.8h\n"
+                "sadalp	v18.4s, v14.8h\n"
+                "smull	v13.8h, v1.8b, %[b1].8b\n"
+                "sadalp	v19.4s, v15.8h\n"
+                "smull	v14.8h, v1.8b, %[b2].8b\n"
+                "ldr	%q[b2a], [%[b_ptr], #96]\n"
+                "smull	v15.8h, v1.8b, %[b3].8b\n"
+                "smlal2	v12.8h, v1.16b, %[b0].16b\n"
+                "ldr	%q[b3a], [%[b_ptr], #112]\n"
+                "smlal2	v13.8h, v1.16b, %[b1].16b\n"
+                "add	%[b_ptr], %[b_ptr], #128\n"
+                "smlal2	v14.8h, v1.16b, %[b2].16b\n"
+                "smlal2	v15.8h, v1.16b, %[b3].16b\n"
+                "ldr 	q1, [%[a_ptr], #80]\n"
 
-                "sadalp    v20.4s, v12.8h\n"
-                "smull    v12.8h, v2.8b, %[b0].8b\n"
-                "sadalp    v21.4s, v13.8h\n"
-                "sadalp    v22.4s, v14.8h\n"
-                "smull    v13.8h, v2.8b, %[b1].8b\n"
-                "sadalp    v23.4s, v15.8h\n"
-                "smull    v14.8h, v2.8b, %[b2].8b\n"
-                "smull    v15.8h, v2.8b, %[b3].8b\n"
-                "smlal2    v12.8h, v2.16b, %[b0].16b\n" ASM_PREFETCH("[%[b_ptr], #192]")
-                "smlal2    v13.8h, v2.16b, %[b1].16b\n"
-                "smlal2    v14.8h, v2.16b, %[b2].16b\n" ASM_PREFETCH("[%[a_ptr], #320]")
-                "smlal2    v15.8h, v2.16b, %[b3].16b\n"
-                "ldr     q2, [%[a_ptr], #96]\n"
+                "sadalp	v20.4s, v12.8h\n"
+                "smull	v12.8h, v2.8b, %[b0].8b\n"
+                "sadalp	v21.4s, v13.8h\n"
+                "sadalp	v22.4s, v14.8h\n"
+                "smull	v13.8h, v2.8b, %[b1].8b\n"
+                "sadalp	v23.4s, v15.8h\n"
+                "smull	v14.8h, v2.8b, %[b2].8b\n"
+                "smull	v15.8h, v2.8b, %[b3].8b\n"
+                "smlal2	v12.8h, v2.16b, %[b0].16b\n"
+                ASM_PREFETCH("[%[b_ptr], #192]")
+                "smlal2	v13.8h, v2.16b, %[b1].16b\n"
+                "smlal2	v14.8h, v2.16b, %[b2].16b\n"
+                ASM_PREFETCH("[%[a_ptr], #320]")
+                "smlal2	v15.8h, v2.16b, %[b3].16b\n"
+                "ldr 	q2, [%[a_ptr], #96]\n"
 
-                "sadalp    v24.4s, v12.8h\n"
-                "smull    v12.8h, v3.8b, %[b0].8b\n"
-                "sadalp    v25.4s, v13.8h\n"
-                "sadalp    v26.4s, v14.8h\n"
-                "smull    v13.8h, v3.8b, %[b1].8b\n"
-                "sadalp    v27.4s, v15.8h\n"
-                "smull    v14.8h, v3.8b, %[b2].8b\n"
-                "smull    v15.8h, v3.8b, %[b3].8b\n"
-                "smlal2    v12.8h, v3.16b, %[b0].16b\n"
-                "ldr     %q[b0], [%[b_ptr], #0]\n"
-                "smlal2    v13.8h, v3.16b, %[b1].16b\n"
-                "smlal2    v14.8h, v3.16b, %[b2].16b\n"
-                "smlal2    v15.8h, v3.16b, %[b3].16b\n"
-                "ldr     q3, [%[a_ptr], #112]\n"
+                "sadalp	v24.4s, v12.8h\n"
+                "smull	v12.8h, v3.8b, %[b0].8b\n"
+                "sadalp	v25.4s, v13.8h\n"
+                "sadalp	v26.4s, v14.8h\n"
+                "smull	v13.8h, v3.8b, %[b1].8b\n"
+                "sadalp	v27.4s, v15.8h\n"
+                "smull	v14.8h, v3.8b, %[b2].8b\n"
+                "smull	v15.8h, v3.8b, %[b3].8b\n"
+                "smlal2	v12.8h, v3.16b, %[b0].16b\n"
+                "ldr 	%q[b0], [%[b_ptr], #0]\n"
+                "smlal2	v13.8h, v3.16b, %[b1].16b\n"
+                "smlal2	v14.8h, v3.16b, %[b2].16b\n"
+                "smlal2	v15.8h, v3.16b, %[b3].16b\n"
+                "ldr 	q3, [%[a_ptr], #112]\n"
 
                 // Unroll 1
-                "sadalp    v28.4s, v12.8h\n"
-                "smull    v12.8h, v0.8b, %[b0a].8b\n"
-                "sadalp    v29.4s, v13.8h\n"
-                "sadalp    v30.4s, v14.8h\n"
-                "smull    v13.8h, v0.8b, %[b1a].8b\n"
-                "sadalp    v31.4s, v15.8h\n"
-                "smull    v14.8h, v0.8b, %[b2a].8b\n"
-                "smull    v15.8h, v0.8b, %[b3a].8b\n"
-                "ldr     %q[b1], [%[b_ptr], #16]\n"
-                "smlal2    v12.8h, v0.16b, %[b0a].16b\n"
-                "smlal2    v13.8h, v0.16b, %[b1a].16b\n"
-                "ldr     %q[b2], [%[b_ptr], #32]\n"
-                "smlal2    v14.8h, v0.16b, %[b2a].16b\n"
-                "smlal2    v15.8h, v0.16b, %[b3a].16b\n"
-                "ldr     q0, [%[a_ptr], #128]\n"
+                "sadalp	v28.4s, v12.8h\n"
+                "smull	v12.8h, v0.8b, %[b0a].8b\n"
+                "sadalp	v29.4s, v13.8h\n"
+                "sadalp	v30.4s, v14.8h\n"
+                "smull	v13.8h, v0.8b, %[b1a].8b\n"
+                "sadalp	v31.4s, v15.8h\n"
+                "smull	v14.8h, v0.8b, %[b2a].8b\n"
+                "smull	v15.8h, v0.8b, %[b3a].8b\n"
+                "ldr 	%q[b1], [%[b_ptr], #16]\n"
+                "smlal2	v12.8h, v0.16b, %[b0a].16b\n"
+                "smlal2	v13.8h, v0.16b, %[b1a].16b\n"
+                "ldr 	%q[b2], [%[b_ptr], #32]\n"
+                "smlal2	v14.8h, v0.16b, %[b2a].16b\n"
+                "smlal2	v15.8h, v0.16b, %[b3a].16b\n"
+                "ldr 	q0, [%[a_ptr], #128]\n"
 
-                "sadalp    v16.4s, v12.8h\n"
-                "smull    v12.8h, v1.8b, %[b0a].8b\n"
-                "sadalp    v17.4s, v13.8h\n"
-                "sadalp    v18.4s, v14.8h\n"
-                "smull    v13.8h, v1.8b, %[b1a].8b\n"
-                "sadalp    v19.4s, v15.8h\n"
-                "add    %[a_ptr], %[a_ptr], #128\n"
-                "smull    v14.8h, v1.8b, %[b2a].8b\n"
-                "smull    v15.8h, v1.8b, %[b3a].8b\n"
-                "ldr     %q[b3], [%[b_ptr], #48]\n"
-                "smlal2    v12.8h, v1.16b, %[b0a].16b\n"
-                "smlal2    v13.8h, v1.16b, %[b1a].16b\n"
-                "smlal2    v14.8h, v1.16b, %[b2a].16b\n"
-                "smlal2    v15.8h, v1.16b, %[b3a].16b\n"
-                "ldr     q1, [%[a_ptr], #16]\n"
+                "sadalp	v16.4s, v12.8h\n"
+                "smull	v12.8h, v1.8b, %[b0a].8b\n"
+                "sadalp	v17.4s, v13.8h\n"
+                "sadalp	v18.4s, v14.8h\n"
+                "smull	v13.8h, v1.8b, %[b1a].8b\n"
+                "sadalp	v19.4s, v15.8h\n"
+                "add	%[a_ptr], %[a_ptr], #128\n"
+                "smull	v14.8h, v1.8b, %[b2a].8b\n"
+                "smull	v15.8h, v1.8b, %[b3a].8b\n"
+                "ldr 	%q[b3], [%[b_ptr], #48]\n"
+                "smlal2	v12.8h, v1.16b, %[b0a].16b\n"
+                "smlal2	v13.8h, v1.16b, %[b1a].16b\n"
+                "smlal2	v14.8h, v1.16b, %[b2a].16b\n"
+                "smlal2	v15.8h, v1.16b, %[b3a].16b\n"
+                "ldr 	q1, [%[a_ptr], #16]\n"
 
-                "sadalp    v20.4s, v12.8h\n"
-                "smull    v12.8h, v2.8b, %[b0a].8b\n"
-                "sadalp    v21.4s, v13.8h\n"
-                "sadalp    v22.4s, v14.8h\n"
-                "smull    v13.8h, v2.8b, %[b1a].8b\n"
-                "sadalp    v23.4s, v15.8h\n"
-                "smull    v14.8h, v2.8b, %[b2a].8b\n"
-                "smull    v15.8h, v2.8b, %[b3a].8b\n"
-                "smlal2    v12.8h, v2.16b, %[b0a].16b\n" ASM_PREFETCH("[%[b_ptr], #256]")
-                "smlal2    v13.8h, v2.16b, %[b1a].16b\n"
-                "smlal2    v14.8h, v2.16b, %[b2a].16b\n" ASM_PREFETCH("[%[a_ptr], #256]")
-                "smlal2    v15.8h, v2.16b, %[b3a].16b\n"
-                "ldr     q2, [%[a_ptr], #32]\n"
+                "sadalp	v20.4s, v12.8h\n"
+                "smull	v12.8h, v2.8b, %[b0a].8b\n"
+                "sadalp	v21.4s, v13.8h\n"
+                "sadalp	v22.4s, v14.8h\n"
+                "smull	v13.8h, v2.8b, %[b1a].8b\n"
+                "sadalp	v23.4s, v15.8h\n"
+                "smull	v14.8h, v2.8b, %[b2a].8b\n"
+                "smull	v15.8h, v2.8b, %[b3a].8b\n"
+                "smlal2	v12.8h, v2.16b, %[b0a].16b\n"
+                ASM_PREFETCH("[%[b_ptr], #256]")
+                "smlal2	v13.8h, v2.16b, %[b1a].16b\n"
+                "smlal2	v14.8h, v2.16b, %[b2a].16b\n"
+                ASM_PREFETCH("[%[a_ptr], #256]")
+                "smlal2	v15.8h, v2.16b, %[b3a].16b\n"
+                "ldr 	q2, [%[a_ptr], #32]\n"
 
-                "sadalp    v24.4s, v12.8h\n"
-                "smull    v12.8h, v3.8b, %[b0a].8b\n"
-                "sadalp    v25.4s, v13.8h\n"
-                "sadalp    v26.4s, v14.8h\n"
-                "smull    v13.8h, v3.8b, %[b1a].8b\n"
-                "sadalp    v27.4s, v15.8h\n"
-                "smull    v14.8h, v3.8b, %[b2a].8b\n"
-                "smull    v15.8h, v3.8b, %[b3a].8b\n"
-                "smlal2    v12.8h, v3.16b, %[b0a].16b\n"
-                "smlal2    v13.8h, v3.16b, %[b1a].16b\n"
-                "smlal2    v14.8h, v3.16b, %[b2a].16b\n"
-                "smlal2    v15.8h, v3.16b, %[b3a].16b\n"
-                "ldr     q3, [%[a_ptr], #48]\n"
+                "sadalp	v24.4s, v12.8h\n"
+                "smull	v12.8h, v3.8b, %[b0a].8b\n"
+                "sadalp	v25.4s, v13.8h\n"
+                "sadalp	v26.4s, v14.8h\n"
+                "smull	v13.8h, v3.8b, %[b1a].8b\n"
+                "sadalp	v27.4s, v15.8h\n"
+                "smull	v14.8h, v3.8b, %[b2a].8b\n"
+                "smull	v15.8h, v3.8b, %[b3a].8b\n"
+                "smlal2	v12.8h, v3.16b, %[b0a].16b\n"
+                "smlal2	v13.8h, v3.16b, %[b1a].16b\n"
+                "smlal2	v14.8h, v3.16b, %[b2a].16b\n"
+                "smlal2	v15.8h, v3.16b, %[b3a].16b\n"
+                "ldr 	q3, [%[a_ptr], #48]\n"
 
                 // Start of unroll 0 for next iteration.
-                "sadalp    v28.4s, v12.8h\n"
-                "smull    v12.8h, v0.8b, %[b0].8b\n"
-                "sadalp    v29.4s, v13.8h\n"
-                "sadalp    v30.4s, v14.8h\n"
-                "smull    v13.8h, v0.8b, %[b1].8b\n"
-                "sadalp    v31.4s, v15.8h\n"
-                "bne    1b\n"
+                "sadalp	v28.4s, v12.8h\n"
+                "smull	v12.8h, v0.8b, %[b0].8b\n"
+                "sadalp	v29.4s, v13.8h\n"
+                "sadalp	v30.4s, v14.8h\n"
+                "smull	v13.8h, v0.8b, %[b1].8b\n"
+                "sadalp	v31.4s, v15.8h\n"
+                "bne	1b\n"
 
                 // Target to use when K=1 or 2 (i.e. zero iterations of main loop)
                 "4:\n"
 
                 // Branch to alternative tail for odd K
-                "cbnz    %w[oddk], 2f\n"
+                "cbnz	%w[oddk], 2f\n"
 
                 // Detached final iteration (even K)
-                "smull    v14.8h, v0.8b, %[b2].8b\n"
-                "smull    v15.8h, v0.8b, %[b3].8b\n"
-                "ldr    %q[b0a], [%[b_ptr], #64]\n"
-                "smlal2    v12.8h, v0.16b, %[b0].16b\n"
-                "smlal2    v13.8h, v0.16b, %[b1].16b\n"
-                "ldr    %q[b1a], [%[b_ptr], #80]\n"
-                "smlal2    v14.8h, v0.16b, %[b2].16b\n"
-                "smlal2    v15.8h, v0.16b, %[b3].16b\n"
-                "ldr     q0, [%[a_ptr], #64]\n"
+                "smull	v14.8h, v0.8b, %[b2].8b\n"
+                "smull	v15.8h, v0.8b, %[b3].8b\n"
+                "ldr	%q[b0a], [%[b_ptr], #64]\n"
+                "smlal2	v12.8h, v0.16b, %[b0].16b\n"
+                "smlal2	v13.8h, v0.16b, %[b1].16b\n"
+                "ldr	%q[b1a], [%[b_ptr], #80]\n"
+                "smlal2	v14.8h, v0.16b, %[b2].16b\n"
+                "smlal2	v15.8h, v0.16b, %[b3].16b\n"
+                "ldr 	q0, [%[a_ptr], #64]\n"
 
-                "sadalp    v16.4s, v12.8h\n"
-                "smull    v12.8h, v1.8b, %[b0].8b\n"
-                "sadalp    v17.4s, v13.8h\n"
-                "sadalp    v18.4s, v14.8h\n"
-                "smull    v13.8h, v1.8b, %[b1].8b\n"
-                "sadalp    v19.4s, v15.8h\n"
-                "smull    v14.8h, v1.8b, %[b2].8b\n"
-                "ldr    %q[b2a], [%[b_ptr], #96]\n"
-                "smull    v15.8h, v1.8b, %[b3].8b\n"
-                "smlal2    v12.8h, v1.16b, %[b0].16b\n"
-                "ldr    %q[b3a], [%[b_ptr], #112]\n"
-                "smlal2    v13.8h, v1.16b, %[b1].16b\n"
-                "add    %[b_ptr], %[b_ptr], #128\n"
-                "smlal2    v14.8h, v1.16b, %[b2].16b\n"
-                "smlal2    v15.8h, v1.16b, %[b3].16b\n"
-                "ldr     q1, [%[a_ptr], #80]\n"
+                "sadalp	v16.4s, v12.8h\n"
+                "smull	v12.8h, v1.8b, %[b0].8b\n"
+                "sadalp	v17.4s, v13.8h\n"
+                "sadalp	v18.4s, v14.8h\n"
+                "smull	v13.8h, v1.8b, %[b1].8b\n"
+                "sadalp	v19.4s, v15.8h\n"
+                "smull	v14.8h, v1.8b, %[b2].8b\n"
+                "ldr	%q[b2a], [%[b_ptr], #96]\n"
+                "smull	v15.8h, v1.8b, %[b3].8b\n"
+                "smlal2	v12.8h, v1.16b, %[b0].16b\n"
+                "ldr	%q[b3a], [%[b_ptr], #112]\n"
+                "smlal2	v13.8h, v1.16b, %[b1].16b\n"
+                "add	%[b_ptr], %[b_ptr], #128\n"
+                "smlal2	v14.8h, v1.16b, %[b2].16b\n"
+                "smlal2	v15.8h, v1.16b, %[b3].16b\n"
+                "ldr 	q1, [%[a_ptr], #80]\n"
 
-                "sadalp    v20.4s, v12.8h\n"
-                "smull    v12.8h, v2.8b, %[b0].8b\n"
-                "sadalp    v21.4s, v13.8h\n"
-                "sadalp    v22.4s, v14.8h\n"
-                "smull    v13.8h, v2.8b, %[b1].8b\n"
-                "sadalp    v23.4s, v15.8h\n"
-                "smull    v14.8h, v2.8b, %[b2].8b\n"
-                "smull    v15.8h, v2.8b, %[b3].8b\n"
-                "smlal2    v12.8h, v2.16b, %[b0].16b\n"
-                "smlal2    v13.8h, v2.16b, %[b1].16b\n"
-                "smlal2    v14.8h, v2.16b, %[b2].16b\n"
-                "smlal2    v15.8h, v2.16b, %[b3].16b\n"
-                "ldr     q2, [%[a_ptr], #96]\n"
+                "sadalp	v20.4s, v12.8h\n"
+                "smull	v12.8h, v2.8b, %[b0].8b\n"
+                "sadalp	v21.4s, v13.8h\n"
+                "sadalp	v22.4s, v14.8h\n"
+                "smull	v13.8h, v2.8b, %[b1].8b\n"
+                "sadalp	v23.4s, v15.8h\n"
+                "smull	v14.8h, v2.8b, %[b2].8b\n"
+                "smull	v15.8h, v2.8b, %[b3].8b\n"
+                "smlal2	v12.8h, v2.16b, %[b0].16b\n"
+                "smlal2	v13.8h, v2.16b, %[b1].16b\n"
+                "smlal2	v14.8h, v2.16b, %[b2].16b\n"
+                "smlal2	v15.8h, v2.16b, %[b3].16b\n"
+                "ldr 	q2, [%[a_ptr], #96]\n"
 
-                "sadalp    v24.4s, v12.8h\n"
-                "smull    v12.8h, v3.8b, %[b0].8b\n"
-                "sadalp    v25.4s, v13.8h\n"
-                "sadalp    v26.4s, v14.8h\n"
-                "smull    v13.8h, v3.8b, %[b1].8b\n"
-                "sadalp    v27.4s, v15.8h\n"
-                "smull    v14.8h, v3.8b, %[b2].8b\n"
-                "smull    v15.8h, v3.8b, %[b3].8b\n"
-                "smlal2    v12.8h, v3.16b, %[b0].16b\n"
-                "smlal2    v13.8h, v3.16b, %[b1].16b\n"
-                "smlal2    v14.8h, v3.16b, %[b2].16b\n"
-                "smlal2    v15.8h, v3.16b, %[b3].16b\n"
-                "ldr     q3, [%[a_ptr], #112]\n"
+                "sadalp	v24.4s, v12.8h\n"
+                "smull	v12.8h, v3.8b, %[b0].8b\n"
+                "sadalp	v25.4s, v13.8h\n"
+                "sadalp	v26.4s, v14.8h\n"
+                "smull	v13.8h, v3.8b, %[b1].8b\n"
+                "sadalp	v27.4s, v15.8h\n"
+                "smull	v14.8h, v3.8b, %[b2].8b\n"
+                "smull	v15.8h, v3.8b, %[b3].8b\n"
+                "smlal2	v12.8h, v3.16b, %[b0].16b\n"
+                "smlal2	v13.8h, v3.16b, %[b1].16b\n"
+                "smlal2	v14.8h, v3.16b, %[b2].16b\n"
+                "smlal2	v15.8h, v3.16b, %[b3].16b\n"
+                "ldr 	q3, [%[a_ptr], #112]\n"
 
                 // Unroll 1
-                "sadalp    v28.4s, v12.8h\n"
-                "smull    v12.8h, v0.8b, %[b0a].8b\n"
-                "sadalp    v29.4s, v13.8h\n"
-                "sadalp    v30.4s, v14.8h\n"
-                "smull    v13.8h, v0.8b, %[b1a].8b\n"
-                "sadalp    v31.4s, v15.8h\n"
-                "smull    v14.8h, v0.8b, %[b2a].8b\n"
-                "add    %[a_ptr], %[a_ptr], #128\n"
-                "smull    v15.8h, v0.8b, %[b3a].8b\n"
-                "smlal2    v12.8h, v0.16b, %[b0a].16b\n"
-                "smlal2    v13.8h, v0.16b, %[b1a].16b\n"
-                "smlal2    v14.8h, v0.16b, %[b2a].16b\n"
-                "smlal2    v15.8h, v0.16b, %[b3a].16b\n"
+                "sadalp	v28.4s, v12.8h\n"
+                "smull	v12.8h, v0.8b, %[b0a].8b\n"
+                "sadalp	v29.4s, v13.8h\n"
+                "sadalp	v30.4s, v14.8h\n"
+                "smull	v13.8h, v0.8b, %[b1a].8b\n"
+                "sadalp	v31.4s, v15.8h\n"
+                "smull	v14.8h, v0.8b, %[b2a].8b\n"
+                "add	%[a_ptr], %[a_ptr], #128\n"
+                "smull	v15.8h, v0.8b, %[b3a].8b\n"
+                "smlal2	v12.8h, v0.16b, %[b0a].16b\n"
+                "smlal2	v13.8h, v0.16b, %[b1a].16b\n"
+                "smlal2	v14.8h, v0.16b, %[b2a].16b\n"
+                "smlal2	v15.8h, v0.16b, %[b3a].16b\n"
 
-                "sadalp    v16.4s, v12.8h\n"
-                "smull    v12.8h, v1.8b, %[b0a].8b\n"
-                "sadalp    v17.4s, v13.8h\n"
-                "sadalp    v18.4s, v14.8h\n"
-                "smull    v13.8h, v1.8b, %[b1a].8b\n"
-                "sadalp    v19.4s, v15.8h\n"
-                "smull    v14.8h, v1.8b, %[b2a].8b\n"
-                "smull    v15.8h, v1.8b, %[b3a].8b\n"
-                "smlal2    v12.8h, v1.16b, %[b0a].16b\n"
-                "addp    v16.4s, v16.4s, v17.4s\n"
-                "smlal2    v13.8h, v1.16b, %[b1a].16b\n"
-                "addp    v17.4s, v18.4s, v19.4s\n"
-                "smlal2    v14.8h, v1.16b, %[b2a].16b\n"
-                "smlal2    v15.8h, v1.16b, %[b3a].16b\n"
+                "sadalp	v16.4s, v12.8h\n"
+                "smull	v12.8h, v1.8b, %[b0a].8b\n"
+                "sadalp	v17.4s, v13.8h\n"
+                "sadalp	v18.4s, v14.8h\n"
+                "smull	v13.8h, v1.8b, %[b1a].8b\n"
+                "sadalp	v19.4s, v15.8h\n"
+                "smull	v14.8h, v1.8b, %[b2a].8b\n"
+                "smull	v15.8h, v1.8b, %[b3a].8b\n"
+                "smlal2	v12.8h, v1.16b, %[b0a].16b\n"
+                "addp	v16.4s, v16.4s, v17.4s\n"
+                "smlal2	v13.8h, v1.16b, %[b1a].16b\n"
+                "addp	v17.4s, v18.4s, v19.4s\n"
+                "smlal2	v14.8h, v1.16b, %[b2a].16b\n"
+                "smlal2	v15.8h, v1.16b, %[b3a].16b\n"
 
-                "sadalp    v20.4s, v12.8h\n"
-                "smull    v12.8h, v2.8b, %[b0a].8b\n"
-                "sadalp    v21.4s, v13.8h\n"
-                "sadalp    v22.4s, v14.8h\n"
-                "smull    v13.8h, v2.8b, %[b1a].8b\n"
-                "sadalp    v23.4s, v15.8h\n"
-                "addp    v16.4s, v16.4s, v17.4s\n"
-                "smull    v14.8h, v2.8b, %[b2a].8b\n"
-                "addp    v18.4s, v20.4s, v21.4s\n"
-                "addp    v19.4s, v22.4s, v23.4s\n"
-                "smull    v15.8h, v2.8b, %[b3a].8b\n"
-                "smlal2    v12.8h, v2.16b, %[b0a].16b\n"
-                "str    q16, [%[c_ptr]]\n"
-                "smlal2    v13.8h, v2.16b, %[b1a].16b\n"
-                "smlal2    v14.8h, v2.16b, %[b2a].16b\n"
-                "smlal2    v15.8h, v2.16b, %[b3a].16b\n"
+                "sadalp	v20.4s, v12.8h\n"
+                "smull	v12.8h, v2.8b, %[b0a].8b\n"
+                "sadalp	v21.4s, v13.8h\n"
+                "sadalp	v22.4s, v14.8h\n"
+                "smull	v13.8h, v2.8b, %[b1a].8b\n"
+                "sadalp	v23.4s, v15.8h\n"
+                "addp	v16.4s, v16.4s, v17.4s\n"
+                "smull	v14.8h, v2.8b, %[b2a].8b\n"
+                "addp	v18.4s, v20.4s, v21.4s\n"
+                "addp	v19.4s, v22.4s, v23.4s\n"
+                "smull	v15.8h, v2.8b, %[b3a].8b\n"
+                "smlal2	v12.8h, v2.16b, %[b0a].16b\n"
+                "str	q16, [%[c_ptr]]\n"
+                "smlal2	v13.8h, v2.16b, %[b1a].16b\n"
+                "smlal2	v14.8h, v2.16b, %[b2a].16b\n"
+                "smlal2	v15.8h, v2.16b, %[b3a].16b\n"
 
-                "sadalp    v24.4s, v12.8h\n"
-                "smull    v12.8h, v3.8b, %[b0a].8b\n"
-                "sadalp    v25.4s, v13.8h\n"
-                "sadalp    v26.4s, v14.8h\n"
-                "smull    v13.8h, v3.8b, %[b1a].8b\n"
-                "sadalp    v27.4s, v15.8h\n"
-                "addp    v17.4s, v18.4s, v19.4s\n"
-                "smull    v14.8h, v3.8b, %[b2a].8b\n"
-                "addp    v20.4s, v24.4s, v25.4s\n"
-                "addp    v21.4s, v26.4s, v27.4s\n"
-                "smull    v15.8h, v3.8b, %[b3a].8b\n"
-                "smlal2    v12.8h, v3.16b, %[b0a].16b\n"
-                "str    q17, [%[c_ptr], #16]\n"
-                "smlal2    v13.8h, v3.16b, %[b1a].16b\n"
-                "smlal2    v14.8h, v3.16b, %[b2a].16b\n"
-                "addp    v18.4s, v20.4s, v21.4s\n"
-                "smlal2    v15.8h, v3.16b, %[b3a].16b\n"
-                "b    3f\n"
+                "sadalp	v24.4s, v12.8h\n"
+                "smull	v12.8h, v3.8b, %[b0a].8b\n"
+                "sadalp	v25.4s, v13.8h\n"
+                "sadalp	v26.4s, v14.8h\n"
+                "smull	v13.8h, v3.8b, %[b1a].8b\n"
+                "sadalp	v27.4s, v15.8h\n"
+                "addp	v17.4s, v18.4s, v19.4s\n"
+                "smull	v14.8h, v3.8b, %[b2a].8b\n"
+                "addp	v20.4s, v24.4s, v25.4s\n"
+                "addp	v21.4s, v26.4s, v27.4s\n"
+                "smull	v15.8h, v3.8b, %[b3a].8b\n"
+                "smlal2	v12.8h, v3.16b, %[b0a].16b\n"
+                "str	q17, [%[c_ptr], #16]\n"
+                "smlal2	v13.8h, v3.16b, %[b1a].16b\n"
+                "smlal2	v14.8h, v3.16b, %[b2a].16b\n"
+                "addp	v18.4s, v20.4s, v21.4s\n"
+                "smlal2	v15.8h, v3.16b, %[b3a].16b\n"
+                "b	3f\n"
 
                 // Detached final iteration (odd K)
                 "2:\n"
-                "smull    v14.8h, v0.8b, %[b2].8b\n"
-                "add    %[a_ptr], %[a_ptr], #64\n"
-                "smull    v15.8h, v0.8b, %[b3].8b\n"
-                "add    %[b_ptr], %[b_ptr], #64\n"
-                "smlal2    v12.8h, v0.16b, %[b0].16b\n"
-                "smlal2    v13.8h, v0.16b, %[b1].16b\n"
-                "smlal2    v14.8h, v0.16b, %[b2].16b\n"
-                "smlal2    v15.8h, v0.16b, %[b3].16b\n"
+                "smull	v14.8h, v0.8b, %[b2].8b\n"
+                "add	%[a_ptr], %[a_ptr], #64\n"
+                "smull	v15.8h, v0.8b, %[b3].8b\n"
+                "add	%[b_ptr], %[b_ptr], #64\n"
+                "smlal2	v12.8h, v0.16b, %[b0].16b\n"
+                "smlal2	v13.8h, v0.16b, %[b1].16b\n"
+                "smlal2	v14.8h, v0.16b, %[b2].16b\n"
+                "smlal2	v15.8h, v0.16b, %[b3].16b\n"
 
-                "sadalp    v16.4s, v12.8h\n"
-                "smull    v12.8h, v1.8b, %[b0].8b\n"
-                "sadalp    v17.4s, v13.8h\n"
-                "sadalp    v18.4s, v14.8h\n"
-                "smull    v13.8h, v1.8b, %[b1].8b\n"
-                "sadalp    v19.4s, v15.8h\n"
-                "smull    v14.8h, v1.8b, %[b2].8b\n"
-                "smull    v15.8h, v1.8b, %[b3].8b\n"
-                "smlal2    v12.8h, v1.16b, %[b0].16b\n"
-                "addp    v16.4s, v16.4s, v17.4s\n"
-                "smlal2    v13.8h, v1.16b, %[b1].16b\n"
-                "addp    v17.4s, v18.4s, v19.4s\n"
-                "smlal2    v14.8h, v1.16b, %[b2].16b\n"
-                "smlal2    v15.8h, v1.16b, %[b3].16b\n"
+                "sadalp	v16.4s, v12.8h\n"
+                "smull	v12.8h, v1.8b, %[b0].8b\n"
+                "sadalp	v17.4s, v13.8h\n"
+                "sadalp	v18.4s, v14.8h\n"
+                "smull	v13.8h, v1.8b, %[b1].8b\n"
+                "sadalp	v19.4s, v15.8h\n"
+                "smull	v14.8h, v1.8b, %[b2].8b\n"
+                "smull	v15.8h, v1.8b, %[b3].8b\n"
+                "smlal2	v12.8h, v1.16b, %[b0].16b\n"
+                "addp	v16.4s, v16.4s, v17.4s\n"
+                "smlal2	v13.8h, v1.16b, %[b1].16b\n"
+                "addp	v17.4s, v18.4s, v19.4s\n"
+                "smlal2	v14.8h, v1.16b, %[b2].16b\n"
+                "smlal2	v15.8h, v1.16b, %[b3].16b\n"
 
-                "sadalp    v20.4s, v12.8h\n"
-                "smull    v12.8h, v2.8b, %[b0].8b\n"
-                "sadalp    v21.4s, v13.8h\n"
-                "sadalp    v22.4s, v14.8h\n"
-                "smull    v13.8h, v2.8b, %[b1].8b\n"
-                "sadalp    v23.4s, v15.8h\n"
-                "addp    v16.4s, v16.4s, v17.4s\n"
-                "smull    v14.8h, v2.8b, %[b2].8b\n"
-                "addp    v18.4s, v20.4s, v21.4s\n"
-                "addp    v19.4s, v22.4s, v23.4s\n"
-                "smull    v15.8h, v2.8b, %[b3].8b\n"
-                "smlal2    v12.8h, v2.16b, %[b0].16b\n"
-                "str    q16, [%[c_ptr]]\n"
-                "smlal2    v13.8h, v2.16b, %[b1].16b\n"
-                "smlal2    v14.8h, v2.16b, %[b2].16b\n"
-                "smlal2    v15.8h, v2.16b, %[b3].16b\n"
+                "sadalp	v20.4s, v12.8h\n"
+                "smull	v12.8h, v2.8b, %[b0].8b\n"
+                "sadalp	v21.4s, v13.8h\n"
+                "sadalp	v22.4s, v14.8h\n"
+                "smull	v13.8h, v2.8b, %[b1].8b\n"
+                "sadalp	v23.4s, v15.8h\n"
+                "addp	v16.4s, v16.4s, v17.4s\n"
+                "smull	v14.8h, v2.8b, %[b2].8b\n"
+                "addp	v18.4s, v20.4s, v21.4s\n"
+                "addp	v19.4s, v22.4s, v23.4s\n"
+                "smull	v15.8h, v2.8b, %[b3].8b\n"
+                "smlal2	v12.8h, v2.16b, %[b0].16b\n"
+                "str	q16, [%[c_ptr]]\n"
+                "smlal2	v13.8h, v2.16b, %[b1].16b\n"
+                "smlal2	v14.8h, v2.16b, %[b2].16b\n"
+                "smlal2	v15.8h, v2.16b, %[b3].16b\n"
 
-                "sadalp    v24.4s, v12.8h\n"
-                "smull    v12.8h, v3.8b, %[b0].8b\n"
-                "sadalp    v25.4s, v13.8h\n"
-                "sadalp    v26.4s, v14.8h\n"
-                "smull    v13.8h, v3.8b, %[b1].8b\n"
-                "sadalp    v27.4s, v15.8h\n"
-                "addp    v17.4s, v18.4s, v19.4s\n"
-                "smull    v14.8h, v3.8b, %[b2].8b\n"
-                "addp    v20.4s, v24.4s, v25.4s\n"
-                "addp    v21.4s, v26.4s, v27.4s\n"
-                "smull    v15.8h, v3.8b, %[b3].8b\n"
-                "smlal2    v12.8h, v3.16b, %[b0].16b\n"
-                "str    q17, [%[c_ptr], #16]\n"
-                "smlal2    v13.8h, v3.16b, %[b1].16b\n"
-                "smlal2    v14.8h, v3.16b, %[b2].16b\n"
-                "addp    v18.4s, v20.4s, v21.4s\n"
-                "smlal2    v15.8h, v3.16b, %[b3].16b\n"
+                "sadalp	v24.4s, v12.8h\n"
+                "smull	v12.8h, v3.8b, %[b0].8b\n"
+                "sadalp	v25.4s, v13.8h\n"
+                "sadalp	v26.4s, v14.8h\n"
+                "smull	v13.8h, v3.8b, %[b1].8b\n"
+                "sadalp	v27.4s, v15.8h\n"
+                "addp	v17.4s, v18.4s, v19.4s\n"
+                "smull	v14.8h, v3.8b, %[b2].8b\n"
+                "addp	v20.4s, v24.4s, v25.4s\n"
+                "addp	v21.4s, v26.4s, v27.4s\n"
+                "smull	v15.8h, v3.8b, %[b3].8b\n"
+                "smlal2	v12.8h, v3.16b, %[b0].16b\n"
+                "str	q17, [%[c_ptr], #16]\n"
+                "smlal2	v13.8h, v3.16b, %[b1].16b\n"
+                "smlal2	v14.8h, v3.16b, %[b2].16b\n"
+                "addp	v18.4s, v20.4s, v21.4s\n"
+                "smlal2	v15.8h, v3.16b, %[b3].16b\n"
 
                 "3:\n"
 
                 // Final additions
-                "sadalp    v28.4s, v12.8h\n"
-                "str    q18, [%[c_ptr], #32]\n"
-                "sadalp    v29.4s, v13.8h\n"
-                "sadalp    v30.4s, v14.8h\n"
-                "sadalp    v31.4s, v15.8h\n"
+                "sadalp	v28.4s, v12.8h\n"
+                "str	q18, [%[c_ptr], #32]\n"
+                "sadalp	v29.4s, v13.8h\n"
+                "sadalp	v30.4s, v14.8h\n"
+                "sadalp	v31.4s, v15.8h\n"
 
                 // Horizontal reduction, phase 1
-                "addp    v22.4s, v28.4s, v29.4s\n"
-                "addp    v23.4s, v30.4s, v31.4s\n"
+                "addp	v22.4s, v28.4s, v29.4s\n"
+                "addp	v23.4s, v30.4s, v31.4s\n"
 
                 // Horizontal reduction, phase 2
-                "addp    v19.4s, v22.4s, v23.4s\n"
-                "str    q19, [%[c_ptr], #48]\n"
-                "add    %[c_ptr], %[c_ptr], #64\n"
+                "addp	v19.4s, v22.4s, v23.4s\n"
+                "str	q19, [%[c_ptr], #48]\n"
+                "add	%[c_ptr], %[c_ptr], #64\n"
 
-                :
-                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
-                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [b3] "+w"(b3),
-                [b0a] "+w"(b0a), [b1a] "+w"(b1a), [b2a] "+w"(b2a), [b3a] "+w"(b3a),
-                [k] "+r"(k)
-                : [oddk] "r"(oddk)
-                : "x20", "x21", "v0", "v1", "v2", "v3", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
-                "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+            :
+              [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+              [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [b3] "+w" (b3),
+              [b0a] "+w" (b0a), [b1a] "+w" (b1a), [b2a] "+w" (b2a), [b3a] "+w" (b3a),
+              [k] "+r" (k)
+            : [oddk] "r" (oddk)
+            : "x20", "x21", "v0","v1","v2","v3","v12","v13","v14","v15","v16","v17","v18","v19",
+              "v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31", "cc");
         }
     }
 }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
index 3975732..13dd570 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
@@ -25,8 +25,8 @@
 
 #ifdef __aarch64__
 
-namespace arm_gemm
-{
+namespace arm_gemm {
+
 // Actual kernel implementations
 void a64_gemm_u16_asimd_12x8(const uint16_t *, const uint16_t *, uint32_t *, int, int, int);
 
@@ -38,8 +38,7 @@
 // All kernels in the family must share these characteristics.  The actual
 // kernel to be used can be chosen at runtime, based on the CPU_type
 // structure.
-class gemm_u16_12x8
-{
+class gemm_u16_12x8 {
 public:
     typedef uint16_t operand_type;
     typedef uint32_t result_type;
@@ -48,24 +47,22 @@
 
     /* Describes the data layout for A input */
     static const int A_interleave = 8;
-    static const int A_block      = 1;
-    static const int A_transpose  = 0;
+    static const int A_block = 1;
+    static const int A_transpose = 0;
 
     /* Same for B input */
     static const int B_interleave = 12;
-    static const int B_block      = 1;
-    static const int B_transpose  = 1;
+    static const int B_block = 1;
+    static const int B_transpose = 1;
 
     /* Kernel blocking parameters */
-    static const int out_width  = 12;
+    static const int out_width = 12;
     static const int out_height = 8;
-    static const int k_unroll   = 1;
+    static const int k_unroll = 1;
 
     kern_type kernel = a64_gemm_u16_asimd_12x8;
 
-    gemm_u16_12x8(const CPUInfo *ci)
-    {
-    }
+    gemm_u16_12x8(const CPUInfo *ci) { }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp
index 7903878..4c21620 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp
@@ -27,281 +27,295 @@
 
 #include "../../asmlib.hpp"
 
-namespace arm_gemm
-{
+namespace arm_gemm {
+
 void a64_gemm_u16_asimd_12x8(const uint16_t *Apanel, const uint16_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K)
 {
-    const uint16_t *a_ptr = Apanel;
-    uint32_t       *c_ptr = Cpanel;
+  const uint16_t *a_ptr = Apanel;
+  uint32_t *c_ptr = Cpanel;
 
-    for(int yb = 0; yb < ablocks; yb++)
+  for (int yb = 0; yb < ablocks; yb++)
+  {
+    const uint16_t *a_ptr0 = a_ptr;
+    const uint16_t *b_ptr = Bpanel;
+
+    for (int xb = 0; xb < bblocks; xb++)
     {
-        const uint16_t *a_ptr0 = a_ptr;
-        const uint16_t *b_ptr  = Bpanel;
+      a_ptr = a_ptr0;
+      const bool odd_k = K & 0x1;
+      int k = (K+1)/2 - 1;
 
-        for(int xb = 0; xb < bblocks; xb++)
-        {
-            a_ptr            = a_ptr0;
-            const bool odd_k = K & 0x1;
-            int        k     = (K + 1) / 2 - 1;
+      register uint16x8_t aa asm("v0");
+      register uint16x8_t ab asm("v1");
+      register uint16x8_t b0 asm("v2");
+      register uint16x8_t b1 asm("v3");
+      register uint16x8_t b2 asm("v4");
 
-            register uint16x8_t aa asm("v0");
-            register uint16x8_t ab asm("v1");
-            register uint16x8_t b0 asm("v2");
-            register uint16x8_t b1 asm("v3");
-            register uint16x8_t b2 asm("v4");
+      __asm __volatile (
+        "ldr %d[aa], [%x[a_ptr]]\n"  // Load A[A].lower
+        "movi v5.4s, #0\n"
+        "ldr x20, [%x[a_ptr], #0x08]\n"  // Load A[A].upper
+        "movi v6.4s, #0\n"
+        "ldr %d[b0], [%x[b_ptr]]\n"  // Load B[0].lower
+        "ins %[aa].d[1], x20\n"  // Merge A[A].lower and upper
+        "movi v7.4s, #0\n"
+        ASM_PREFETCH("[%[a_ptr], #64]")
+        "movi v8.4s, #0\n"
+        "ldr x20, [%x[b_ptr], #0x08]\n"  // Load B[0].upper
+        "movi v9.4s, #0\n"
+        ASM_PREFETCH("[%[b_ptr], #64]")
+        "movi v10.4s, #0\n"
+        "ldr %d[b1], [%x[b_ptr], #0x10]\n"  // Load B[1].lower
+        "ins %[b0].d[1], x20\n"  // Merge B[0].lower and upper
+        "movi v11.4s, #0\n"
+        ASM_PREFETCH("[%[a_ptr], #96]")
+        "movi v12.4s, #0\n"
+        "movi v13.4s, #0\n"
+        ASM_PREFETCH("[%[b_ptr], #96]")
+        "movi v14.4s, #0\n"
+        "movi v15.4s, #0\n"
+        ASM_PREFETCH("[%[a_ptr], #128]")
+        "movi v16.4s, #0\n"
+        "movi v17.4s, #0\n"
+        ASM_PREFETCH("[%[b_ptr], #128]")
+        "movi v18.4s, #0\n"
+        "movi v19.4s, #0\n"
+        ASM_PREFETCH("[%[a_ptr], #160]")
+        "movi v20.4s, #0\n"
+        "movi v21.4s, #0\n"
+        ASM_PREFETCH("[%[b_ptr], #160]")
+        "movi v22.4s, #0\n"
+        "movi v23.4s, #0\n"
+        ASM_PREFETCH("[%[a_ptr], #192]")
+        "movi v24.4s, #0\n"
+        "add %x[a_ptr], %x[a_ptr], #0x10\n"
+        "movi v25.4s, #0\n"
+        ASM_PREFETCH("[%[b_ptr], #192]")
+        "movi v26.4s, #0\n"
+        "add %x[b_ptr], %x[b_ptr], #0x18\n"
+        "movi v27.4s, #0\n"
+        "movi v28.4s, #0\n"
 
-            __asm __volatile(
-                "ldr %d[aa], [%x[a_ptr]]\n" // Load A[A].lower
-                "movi v5.4s, #0\n"
-                "ldr x20, [%x[a_ptr], #0x08]\n" // Load A[A].upper
-                "movi v6.4s, #0\n"
-                "ldr %d[b0], [%x[b_ptr]]\n" // Load B[0].lower
-                "ins %[aa].d[1], x20\n"     // Merge A[A].lower and upper
-                "movi v7.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #64]")
-                "movi v8.4s, #0\n"
-                "ldr x20, [%x[b_ptr], #0x08]\n" // Load B[0].upper
-                "movi v9.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #64]")
-                "movi v10.4s, #0\n"
-                "ldr %d[b1], [%x[b_ptr], #0x10]\n" // Load B[1].lower
-                "ins %[b0].d[1], x20\n"            // Merge B[0].lower and upper
-                "movi v11.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #96]")
-                "movi v12.4s, #0\n"
-                "movi v13.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #96]")
-                "movi v14.4s, #0\n"
-                "movi v15.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #128]")
-                "movi v16.4s, #0\n"
-                "movi v17.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #128]")
-                "movi v18.4s, #0\n"
-                "movi v19.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #160]")
-                "movi v20.4s, #0\n"
-                "movi v21.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #160]")
-                "movi v22.4s, #0\n"
-                "movi v23.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #192]")
-                "movi v24.4s, #0\n"
-                "add %x[a_ptr], %x[a_ptr], #0x10\n"
-                "movi v25.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #192]")
-                "movi v26.4s, #0\n"
-                "add %x[b_ptr], %x[b_ptr], #0x18\n"
-                "movi v27.4s, #0\n"
-                "movi v28.4s, #0\n"
+        "cbz %x[k], 2f\n"  // Skip the loop if doing zero iterations.
 
-                "cbz %x[k], 2f\n" // Skip the loop if doing zero iterations.
+        "1:\n"  // Main loop
+          // First unroll
+          "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+          "ldr x20, [%x[b_ptr]]\n"  // Load B[1].upper
+          "umlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+          "umlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+          "ldr %d[ab], [%x[a_ptr]]\n"  // Load A[B].lower
+          "ins %[b1].d[1], x20\n"  // Merge B[1].lower and .upper
+          "umlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+          "umlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+          "ldr x20, [%x[a_ptr], #0x8]\n"  // Load A[B].upper
+          "umlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+          "umlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+          "ldr %d[b2], [%x[b_ptr], #0x8]\n"  // Load B[2].lower
+          "ins %[ab].d[1], x20\n"  // Merge A[B].lower and .upper
+          "umlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+          "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+          "ldr x20, [%x[b_ptr], #0x10]\n"  // Load B[2].upper
+          "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+          "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+          "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+          "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+          "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+          "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+          "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+          "ldr %d[b0], [%x[b_ptr], #0x18]\n"  // Load B[0].lower
+          "ins %[b2].d[1], x20\n"  // Merge B[2].lower and .upper
+          "umlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+          "umlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+          "ldr x20, [%x[b_ptr], #0x20]\n"  // Load B[0].upper
+          "umlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+          "umlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+          "umlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+          "umlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+          "umlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+          "umlal v28.4s, %[b1].4h, %[aa].h[7]\n"
 
-                "1:\n" // Main loop
-                // First unroll
-                "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
-                "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
-                "umlal v6.4s, %[b0].4h, %[aa].h[1]\n"
-                "umlal v7.4s, %[b0].4h, %[aa].h[2]\n"
-                "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
-                "ins %[b1].d[1], x20\n"     // Merge B[1].lower and .upper
-                "umlal v8.4s, %[b0].4h, %[aa].h[3]\n"
-                "umlal v9.4s, %[b0].4h, %[aa].h[4]\n"
-                "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
-                "umlal v10.4s, %[b0].4h, %[aa].h[5]\n"
-                "umlal v11.4s, %[b0].4h, %[aa].h[6]\n"
-                "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
-                "ins %[ab].d[1], x20\n"           // Merge A[B].lower and .upper
-                "umlal v12.4s, %[b0].4h, %[aa].h[7]\n"
-                "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
-                "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
-                "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
-                "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
-                "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
-                "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
-                "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
-                "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
-                "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
-                "ldr %d[b0], [%x[b_ptr], #0x18]\n" // Load B[0].lower
-                "ins %[b2].d[1], x20\n"            // Merge B[2].lower and .upper
-                "umlal v21.4s, %[b1].4h, %[aa].h[0]\n"
-                "umlal v22.4s, %[b1].4h, %[aa].h[1]\n"
-                "ldr x20, [%x[b_ptr], #0x20]\n" // Load B[0].upper
-                "umlal v23.4s, %[b1].4h, %[aa].h[2]\n"
-                "umlal v24.4s, %[b1].4h, %[aa].h[3]\n"
-                "umlal v25.4s, %[b1].4h, %[aa].h[4]\n"
-                "umlal v26.4s, %[b1].4h, %[aa].h[5]\n"
-                "umlal v27.4s, %[b1].4h, %[aa].h[6]\n"
-                "umlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+          // Second unroll
+          "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
+          "ldr %d[aa], [%x[a_ptr], #0x10]\n"  // Load A[A].lower
+          "ins %[b0].d[1], x20\n"  // Merge B[0].lower and .upper
+          "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
+          "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
+          "ldr x20, [%x[a_ptr], #0x18]\n"  // Load A[A].upper
+          "umlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
+          "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
+          "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
+          "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
+          "add %x[a_ptr], %x[a_ptr], #0x20\n"
+          "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
+          "umlal v13.4s, %[b2].4h, %[ab].h[0]\n"
+          ASM_PREFETCH("[%[b_ptr], #320]")
+          "umlal v14.4s, %[b2].4h, %[ab].h[1]\n"
+          "umlal v15.4s, %[b2].4h, %[ab].h[2]\n"
+          ASM_PREFETCH("[%[a_ptr], #320]")
+          "umlal v16.4s, %[b2].4h, %[ab].h[3]\n"
+          "umlal v17.4s, %[b2].4h, %[ab].h[4]\n"
+          ASM_PREFETCH("[%[b_ptr], #448]")
+          "umlal v18.4s, %[b2].4h, %[ab].h[5]\n"
+          "umlal v19.4s, %[b2].4h, %[ab].h[6]\n"
+          "umlal v20.4s, %[b2].4h, %[ab].h[7]\n"
+          "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
+          "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
+          "subs %x[k], %x[k], #0x1\n"
+          "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
+          "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
+          "ldr %d[b1], [%x[b_ptr], #0x28]\n"  // Load B[1].lower
+          "ins %[aa].d[1], x20\n"  // Merge A[A].lower and .upper
+          "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
+          "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
+          "add %x[b_ptr], %x[b_ptr], #0x30\n"
+          "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
+          "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
+          "bne 1b\n"
 
-                // Second unroll
-                "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
-                "ldr %d[aa], [%x[a_ptr], #0x10]\n" // Load A[A].lower
-                "ins %[b0].d[1], x20\n"            // Merge B[0].lower and .upper
-                "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
-                "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
-                "ldr x20, [%x[a_ptr], #0x18]\n" // Load A[A].upper
-                "umlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
-                "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
-                "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
-                "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
-                "add %x[a_ptr], %x[a_ptr], #0x20\n"
-                "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
-                "umlal v13.4s, %[b2].4h, %[ab].h[0]\n" ASM_PREFETCH("[%[b_ptr], #320]")
-                "umlal v14.4s, %[b2].4h, %[ab].h[1]\n"
-                "umlal v15.4s, %[b2].4h, %[ab].h[2]\n" ASM_PREFETCH("[%[a_ptr], #320]")
-                "umlal v16.4s, %[b2].4h, %[ab].h[3]\n"
-                "umlal v17.4s, %[b2].4h, %[ab].h[4]\n" ASM_PREFETCH("[%[b_ptr], #448]")
-                "umlal v18.4s, %[b2].4h, %[ab].h[5]\n"
-                "umlal v19.4s, %[b2].4h, %[ab].h[6]\n"
-                "umlal v20.4s, %[b2].4h, %[ab].h[7]\n"
-                "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
-                "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
-                "subs %x[k], %x[k], #0x1\n"
-                "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
-                "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
-                "ldr %d[b1], [%x[b_ptr], #0x28]\n" // Load B[1].lower
-                "ins %[aa].d[1], x20\n"            // Merge A[A].lower and .upper
-                "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
-                "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
-                "add %x[b_ptr], %x[b_ptr], #0x30\n"
-                "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
-                "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
-                "bne 1b\n"
+        "2:\n"  // Even tail
+          "cbnz %x[odd_k], 3f\n"
 
-                "2:\n" // Even tail
-                "cbnz %x[odd_k], 3f\n"
+          "umlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+          "ldr x20, [%x[b_ptr]]\n"  // Load B[1].upper
+          "umlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+          "umlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+          "ldr %d[ab], [%x[a_ptr]]\n"  // Load A[B].lower
+          "ins %[b1].d[1], x20\n"  // Merge B[1].lower and .upper
+          "umlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+          "umlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+          "ldr x20, [%x[a_ptr], #0x8]\n"  // Load A[B].upper
+          "umlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+          "umlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+          "ldr %d[b2], [%x[b_ptr], #0x8]\n"  // Load B[2].lower
+          "ins %[ab].d[1], x20\n"  // Merge A[B].lower and .upper
+          "umlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+          "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+          "ldr x20, [%x[b_ptr], #0x10]\n"  // Load B[2].upper
+          "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+          "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+          "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+          "add %[a_ptr], %[a_ptr], #0x10\n"
+          "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+          "add %[b_ptr], %[b_ptr], #0x18\n"
+          "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+          "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+          "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+          "ins %[b2].d[1], x20\n"  // Merge B[2].lower and .upper
+          "umlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+          "umlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+          "umlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+          "umlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+          "umlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+          "umlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+          "umlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+          "umlal v28.4s, %[b1].4h, %[aa].h[7]\n"
 
-                "umlal v5.4s, %[b0].4h, %[aa].h[0]\n"
-                "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
-                "umlal v6.4s, %[b0].4h, %[aa].h[1]\n"
-                "umlal v7.4s, %[b0].4h, %[aa].h[2]\n"
-                "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
-                "ins %[b1].d[1], x20\n"     // Merge B[1].lower and .upper
-                "umlal v8.4s, %[b0].4h, %[aa].h[3]\n"
-                "umlal v9.4s, %[b0].4h, %[aa].h[4]\n"
-                "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
-                "umlal v10.4s, %[b0].4h, %[aa].h[5]\n"
-                "umlal v11.4s, %[b0].4h, %[aa].h[6]\n"
-                "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
-                "ins %[ab].d[1], x20\n"           // Merge A[B].lower and .upper
-                "umlal v12.4s, %[b0].4h, %[aa].h[7]\n"
-                "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
-                "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
-                "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
-                "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
-                "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
-                "add %[a_ptr], %[a_ptr], #0x10\n"
-                "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
-                "add %[b_ptr], %[b_ptr], #0x18\n"
-                "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
-                "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
-                "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
-                "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper
-                "umlal v21.4s, %[b1].4h, %[aa].h[0]\n"
-                "umlal v22.4s, %[b1].4h, %[aa].h[1]\n"
-                "umlal v23.4s, %[b1].4h, %[aa].h[2]\n"
-                "umlal v24.4s, %[b1].4h, %[aa].h[3]\n"
-                "umlal v25.4s, %[b1].4h, %[aa].h[4]\n"
-                "umlal v26.4s, %[b1].4h, %[aa].h[5]\n"
-                "umlal v27.4s, %[b1].4h, %[aa].h[6]\n"
-                "umlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+          "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
+          "umlal v13.4s, %[b2].4h, %[ab].h[0]\n"
+          "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
+          "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
+          "umlal v14.4s, %[b2].4h, %[ab].h[1]\n"
+          "str q5, [%x[c_ptr]]\n"
+          "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
+          "str q13, [%x[c_ptr], #0x10]\n"
+          "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
+          "str q21, [%x[c_ptr], #0x20]\n"
+          "umlal v15.4s, %[b2].4h, %[ab].h[2]\n"
+          "str q6, [%x[c_ptr], #0x30]\n"
+          "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
+          "str q14, [%x[c_ptr], #0x40]\n"
+          "umlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
+          "str q22, [%x[c_ptr], #0x50]\n"
+          "umlal v16.4s, %[b2].4h, %[ab].h[3]\n"
+          "str q7, [%x[c_ptr], #0x60]\n"
+          "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
+          "str q15, [%x[c_ptr], #0x70]\n"
+          "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
+          "str q23, [%x[c_ptr], #0x80]\n"
+          "umlal v17.4s, %[b2].4h, %[ab].h[4]\n"
+          "str q8, [%x[c_ptr], #0x90]\n"
+          "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
+          "str q16, [%x[c_ptr], #0xa0]\n"
+          "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
+          "str q24, [%x[c_ptr], #0xb0]\n"
+          "umlal v18.4s, %[b2].4h, %[ab].h[5]\n"
+          "str q9, [%x[c_ptr], #0xc0]\n"
+          "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
+          "str q17, [%x[c_ptr], #0xd0]\n"
+          "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
+          "str q25, [%x[c_ptr], #0xe0]\n"
+          "umlal v19.4s, %[b2].4h, %[ab].h[6]\n"
+          "str q10, [%x[c_ptr], #0xf0]\n"
+          "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
+          "str q18, [%x[c_ptr], #0x100]\n"
+          "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
+          "str q26, [%x[c_ptr], #0x110]\n"
+          "umlal v20.4s, %[b2].4h, %[ab].h[7]\n"
+          "str q11, [%x[c_ptr], #0x120]\n"
+          "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
+          "str q19, [%x[c_ptr], #0x130]\n"
+          "b 4f\n"  // Complete write out
 
-                "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
-                "umlal v13.4s, %[b2].4h, %[ab].h[0]\n"
-                "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
-                "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
-                "umlal v14.4s, %[b2].4h, %[ab].h[1]\n"
-                "str q5, [%x[c_ptr]]\n"
-                "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
-                "str q13, [%x[c_ptr], #0x10]\n"
-                "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
-                "str q21, [%x[c_ptr], #0x20]\n"
-                "umlal v15.4s, %[b2].4h, %[ab].h[2]\n"
-                "str q6, [%x[c_ptr], #0x30]\n"
-                "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
-                "str q14, [%x[c_ptr], #0x40]\n"
-                "umlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
-                "str q22, [%x[c_ptr], #0x50]\n"
-                "umlal v16.4s, %[b2].4h, %[ab].h[3]\n"
-                "str q7, [%x[c_ptr], #0x60]\n"
-                "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
-                "str q15, [%x[c_ptr], #0x70]\n"
-                "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
-                "str q23, [%x[c_ptr], #0x80]\n"
-                "umlal v17.4s, %[b2].4h, %[ab].h[4]\n"
-                "str q8, [%x[c_ptr], #0x90]\n"
-                "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
-                "str q16, [%x[c_ptr], #0xa0]\n"
-                "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
-                "str q24, [%x[c_ptr], #0xb0]\n"
-                "umlal v18.4s, %[b2].4h, %[ab].h[5]\n"
-                "str q9, [%x[c_ptr], #0xc0]\n"
-                "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
-                "str q17, [%x[c_ptr], #0xd0]\n"
-                "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
-                "str q25, [%x[c_ptr], #0xe0]\n"
-                "umlal v19.4s, %[b2].4h, %[ab].h[6]\n"
-                "str q10, [%x[c_ptr], #0xf0]\n"
-                "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
-                "str q18, [%x[c_ptr], #0x100]\n"
-                "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
-                "str q26, [%x[c_ptr], #0x110]\n"
-                "umlal v20.4s, %[b2].4h, %[ab].h[7]\n"
-                "str q11, [%x[c_ptr], #0x120]\n"
-                "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
-                "str q19, [%x[c_ptr], #0x130]\n"
-                "b 4f\n" // Complete write out
+        "3:\n"  // Odd tail
+          "umlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+          "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+          "umlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+          "umlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+          "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+          "umlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+          "str q5, [%x[c_ptr]]\n"
+          "umlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+          "str q13, [%x[c_ptr], #0x10]\n"
+          "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+          "str q21, [%x[c_ptr], #0x20]\n"
+          "umlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+          "str q6, [%x[c_ptr], #0x30]\n"
+          "umlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+          "str q14, [%x[c_ptr], #0x40]\n"
+          "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+          "str q22, [%x[c_ptr], #0x50]\n"
+          "umlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+          "str q7, [%x[c_ptr], #0x60]\n"
+          "umlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+          "str q15, [%x[c_ptr], #0x70]\n"
+          "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+          "str q23, [%x[c_ptr], #0x80]\n"
+          "umlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+          "str q8, [%x[c_ptr], #0x90]\n"
+          "umlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+          "str q16, [%x[c_ptr], #0xa0]\n"
+          "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+          "str q24, [%x[c_ptr], #0xb0]\n"
+          "umlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+          "str q9, [%x[c_ptr], #0xc0]\n"
+          "umlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+          "str q17, [%x[c_ptr], #0xd0]\n"
+          "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+          "str q25, [%x[c_ptr], #0xe0]\n"
+          "umlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+          "str q10, [%x[c_ptr], #0xf0]\n"
+          "umlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+          "str q18, [%x[c_ptr], #0x100]\n"
+          "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+          "str q26, [%x[c_ptr], #0x110]\n"
+          "umlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+          "str q11, [%x[c_ptr], #0x120]\n"
 
-                "3:\n" // Odd tail
-                "umlal v5.4s, %[b0].4h, %[aa].h[0]\n"
-                "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
-                "umlal v21.4s, %[b1].4h, %[aa].h[0]\n"
-                "umlal v6.4s, %[b0].4h, %[aa].h[1]\n"
-                "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
-                "umlal v22.4s, %[b1].4h, %[aa].h[1]\n"
-                "str q5, [%x[c_ptr]]\n"
-                "umlal v7.4s, %[b0].4h, %[aa].h[2]\n"
-                "str q13, [%x[c_ptr], #0x10]\n"
-                "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
-                "str q21, [%x[c_ptr], #0x20]\n"
-                "umlal v23.4s, %[b1].4h, %[aa].h[2]\n"
-                "str q6, [%x[c_ptr], #0x30]\n"
-                "umlal v8.4s, %[b0].4h, %[aa].h[3]\n"
-                "str q14, [%x[c_ptr], #0x40]\n"
-                "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
-                "str q22, [%x[c_ptr], #0x50]\n"
-                "umlal v24.4s, %[b1].4h, %[aa].h[3]\n"
-                "str q7, [%x[c_ptr], #0x60]\n"
-                "umlal v9.4s, %[b0].4h, %[aa].h[4]\n"
-                "str q15, [%x[c_ptr], #0x70]\n"
-                "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
-                "str q23, [%x[c_ptr], #0x80]\n"
-                "umlal v25.4s, %[b1].4h, %[aa].h[4]\n"
-                "str q8, [%x[c_ptr], #0x90]\n"
-                "umlal v10.4s, %[b0].4h, %[aa].h[5]\n"
-                "str q16, [%x[c_ptr], #0xa0]\n"
-                "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
-                "str q24, [%x[c_ptr], #0xb0]\n"
-                "umlal v26.4s, %[b1].4h, %[aa].h[5]\n"
-                "str q9, [%x[c_ptr], #0xc0]\n"
-                "umlal v11.4s, %[b0].4h, %[aa].h[6]\n"
-                "str q17, [%x[c_ptr], #0xd0]\n"
-                "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
-                "str q25, [%x[c_ptr], #0xe0]\n"
-                "umlal v27.4s, %[b1].4h, %[aa].h[6]\n"
-                "str q10, [%x[c_ptr], #0xf0]\n"
-                "umlal v12.4s, %[b0].4h, %[aa].h[7]\n"
-                "str q18, [%x[c_ptr], #0x100]\n"
-                "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
-                "str q26, [%x[c_ptr], #0x110]\n"
-                "umlal v28.4s, %[b1].4h, %[aa].h[7]\n"
-                "str q11, [%x[c_ptr], #0x120]\n"
-
-                "4:\n" // End of function
-                "str q19, [%x[c_ptr], #0x130]\n"
-                "str q27, [%x[c_ptr], #0x140]\n"
-                "str q12, [%x[c_ptr], #0x150]\n"
-                "str q20, [%x[c_ptr], #0x160]\n"
-                "str q28, [%x[c_ptr], #0x170]\n"
-                "add %x[c_ptr], %x[c_ptr], #0x180\n"
-                : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), [k] "+r"(k),
-                [aa] "+w"(aa), [ab] "+w"(ab), [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2)
-                : [odd_k] "r"(odd_k)
-                : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc");
-        }
+        "4:\n"  // End of function
+          "str q19, [%x[c_ptr], #0x130]\n"
+          "str q27, [%x[c_ptr], #0x140]\n"
+          "str q12, [%x[c_ptr], #0x150]\n"
+          "str q20, [%x[c_ptr], #0x160]\n"
+          "str q28, [%x[c_ptr], #0x170]\n"
+          "add %x[c_ptr], %x[c_ptr], #0x180\n"
+        : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k),
+          [aa] "+w" (aa), [ab] "+w" (ab), [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2)
+        : [odd_k] "r" (odd_k)
+        : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc"
+      );
     }
+  }
 }
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
index 26255b1..c67aed7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
@@ -27,41 +27,38 @@
 
 #include "arm_gemm.hpp"
 
-namespace arm_gemm
-{
+namespace arm_gemm {
+
 // Load the actual kernel
 void a64_gemm_u8_12x8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
 void a64_gemm_u8_12x8_a55r1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
 
-class gemm_u8_12x8
-{
+class gemm_u8_12x8 {
 public:
-    typedef uint8_t  operand_type;
+    typedef uint8_t operand_type;
     typedef uint32_t result_type;
 
     typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
 
     /* Describes the data layout for A input */
-    static const int  A_interleave = 8;
-    static const int  A_block      = 4;
-    static const bool A_transpose  = false;
+    static const int A_interleave = 8;
+    static const int A_block = 4;
+    static const bool A_transpose = false;
 
     /* Same for B input */
-    static const int  B_interleave = 12;
-    static const int  B_block      = 4;
-    static const bool B_transpose  = true;
+    static const int B_interleave = 12;
+    static const int B_block = 4;
+    static const bool B_transpose = true;
 
     /* Kernel blocking parameters */
-    static const int out_width  = 12;
+    static const int out_width = 12;
     static const int out_height = 8;
-    static const int k_unroll   = 4;
+    static const int k_unroll = 4;
 
     kern_type kernel = a64_gemm_u8_12x8;
 
-    gemm_u8_12x8(const CPUInfo *ci)
-    {
-        if(ci->get_cpu_model() == CPUModel::A55r1)
-        {
+    gemm_u8_12x8(const CPUInfo *ci) {
+        if (ci->get_cpu_model() == CPUModel::A55r1) {
             kernel = a64_gemm_u8_12x8_a55r1;
         }
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp
index f8fafbd..994aea6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp
@@ -31,40 +31,37 @@
 #include "dot_toolchain_support.h"
 #endif
 
-namespace arm_gemm
-{
-void a64_gemm_u8_12x8_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, const int ablocks, const int bblocks, const int K)
-{
+namespace arm_gemm {
+
+void a64_gemm_u8_12x8_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, const int ablocks, const int bblocks, const int K) {
     const uint8_t *a_ptr = Apanel;
-    uint32_t      *c_ptr = Cpanel;
+    uint32_t *c_ptr = Cpanel;
 
     // We divide K by 4 because the udot instruction processes 4 elements at a time.
-    const int W = K / 4;
+    const int W = K/4;
 
     // Fix up for odd lengths - set a flag if K is odd, but make
     // sure we round up the iteration count.
-    const int oddk    = (W & 1);
-    const int k_iters = ((W + 1) / 2) - 1;
+    const int oddk = (W & 1);
+    const int k_iters = ((W+1)/2) - 1;
 
-    for(int yb = 0; yb < ablocks; yb++)
-    {
+    for (int yb=0; yb<ablocks; yb++) {
         const uint8_t *a_ptr0 = a_ptr;
-        const uint8_t *b_ptr  = Bpanel;
+        const uint8_t *b_ptr = Bpanel;
 
-        for(int xb = 0; xb < bblocks; xb++)
-        {
+        for (int xb=0; xb<bblocks; xb++) {
             a_ptr = a_ptr0;
             int k = k_iters;
 
-            register int32x4_t a0 asm("v0");
-            register int32x4_t a1 asm("v1");
-            register int32x4_t b0 asm("v2");
-            register int32x4_t b1 asm("v3");
-            register int32x4_t b2 asm("v4");
+            register int32x4_t a0  asm("v0");
+            register int32x4_t a1  asm("v1");
+            register int32x4_t b0  asm("v2");
+            register int32x4_t b1  asm("v3");
+            register int32x4_t b2  asm("v4");
             register int32x4_t a0a asm("v5");
             register int32x4_t a1a asm("v6");
 
-            __asm __volatile(
+            __asm __volatile (
 #ifdef NO_DOT_IN_TOOLCHAIN
                 _DECLARE_UDOT
 #else
@@ -79,22 +76,39 @@
                 "ldr    %q[a1], [%[a_ptr], #16]\n"
                 "movi   v11.4s, #0x0\n"
                 "ldr    %q[b1], [%[b_ptr], #16]\n"
-                "movi   v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi   v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi   v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi   v15.4s, #0x0\n"
-                ASM_PREFETCH("[%[a_ptr], #128]") "movi   v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi   v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi   v12.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi   v13.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi   v14.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi   v15.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #128]")
+                "movi   v16.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi   v17.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #256]")
                 "movi   v18.4s, #0x0\n"
-                "movi   v19.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi   v19.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #192]")
                 "movi   v20.4s, #0x0\n"
-                "movi   v21.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi   v21.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #320]")
                 "movi   v22.4s, #0x0\n"
-                "movi   v23.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]")
+                "movi   v23.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #256]")
                 "movi   v24.4s, #0x0\n"
-                "movi   v25.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #384]")
+                "movi   v25.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #384]")
                 "movi   v26.4s, #0x0\n"
-                "movi   v27.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #448]")
+                "movi   v27.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #448]")
                 "movi   v28.4s, #0x0\n"
-                "movi   v29.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #384]")
+                "movi   v29.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #384]")
                 "movi   v30.4s, #0x0\n"
-                "movi   v31.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #512]")
+                "movi   v31.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #512]")
 
                 // The loop is offset by these two instructions which must
                 // always be executed.
@@ -105,102 +119,105 @@
                 "cbz    %w[k], 4f\n"
 
                 "1:\n"
-                "udot      v9.4s , %[b0].16b, %[a0].4b[1]\n"
-                "ldr    x20, [%[b_ptr], #40]\n"
-                "udot    v10.4s, %[b0].16b, %[a0].4b[2]\n"
-                "subs    %w[k], %w[k], #1\n"
-                "udot    v11.4s, %[b0].16b, %[a0].4b[3]\n"
-                "ldr    %d[a0a], [%[a_ptr], #32]\n"
+                "udot  	v9.4s , %[b0].16b, %[a0].4b[1]\n"
+                "ldr	x20, [%[b_ptr], #40]\n"
+                "udot	v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "subs	%w[k], %w[k], #1\n"
+                "udot	v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "ldr	%d[a0a], [%[a_ptr], #32]\n"
 
-                "udot     v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "udot 	v12.4s, %[b0].16b, %[a1].4b[0]\n"
                 "ins    %[b2].d[1], x20\n"
-                "udot    v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                "udot	v13.4s, %[b0].16b, %[a1].4b[1]\n"
                 "ldr    x20, [%[a_ptr], #40]\n"
-                "udot    v14.4s, %[b0].16b, %[a1].4b[2]\n"
-                "udot    v15.4s, %[b0].16b, %[a1].4b[3]\n"
-                "ldr    %d[a1a], [%[a_ptr], #48]\n"
+                "udot	v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "udot	v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "ldr	%d[a1a], [%[a_ptr], #48]\n"
 
-                "udot    v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "udot	v16.4s, %[b1].16b, %[a0].4b[0]\n"
                 "ins    %[a0a].d[1], x20\n"
-                "udot    v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                "udot	v17.4s, %[b1].16b, %[a0].4b[1]\n"
                 "ldr    x20, [%[a_ptr], #56]\n"
-                "udot    v18.4s, %[b1].16b, %[a0].4b[2]\n"
-                "udot    v19.4s, %[b1].16b, %[a0].4b[3]\n"
-                "ldr    %d[b0], [%[b_ptr], #48]\n"
+                "udot	v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "udot	v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "ldr	%d[b0], [%[b_ptr], #48]\n"
 
-                "udot    v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "udot	v20.4s, %[b1].16b, %[a1].4b[0]\n"
                 "ins    %[a1a].d[1], x20\n"
-                "udot    v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "udot	v21.4s, %[b1].16b, %[a1].4b[1]\n"
                 "ldr    x20, [%[b_ptr], #56]\n"
-                "udot    v22.4s, %[b1].16b, %[a1].4b[2]\n"
-                "udot    v23.4s, %[b1].16b, %[a1].4b[3]\n"
-                "ldr    %d[b1], [%[b_ptr], #64]\n"
+                "udot	v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "udot	v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "ldr	%d[b1], [%[b_ptr], #64]\n"
 
-                "udot    v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "udot	v24.4s, %[b2].16b, %[a0].4b[0]\n"
                 "ins    %[b0].d[1], x20\n"
-                "udot    v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                "udot	v25.4s, %[b2].16b, %[a0].4b[1]\n"
                 "ldr    x20, [%[b_ptr], #72]\n"
-                "udot    v26.4s, %[b2].16b, %[a0].4b[2]\n"
-                "udot    v27.4s, %[b2].16b, %[a0].4b[3]\n" ASM_PREFETCH("[%[a_ptr], #448]")
+                "udot	v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "udot	v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                ASM_PREFETCH("[%[a_ptr], #448]")
 
-                "udot    v28.4s, %[b2].16b, %[a1].4b[0]\n"
-                "udot    v29.4s, %[b2].16b, %[a1].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #576]")
-                "udot    v30.4s, %[b2].16b, %[a1].4b[2]\n"
-                "udot    v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "udot	v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "udot	v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                ASM_PREFETCH("[%[b_ptr], #576]")
+                "udot	v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                "udot	v31.4s, %[b2].16b, %[a1].4b[3]\n"
 
-                // Unroll 1
-                "ldr    %d[b2], [%[b_ptr], #80]\n"
+		// Unroll 1
+                "ldr	%d[b2], [%[b_ptr], #80]\n"
 
-                "udot    v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+                "udot	v8.4s , %[b0].16b, %[a0a].4b[0]\n"
                 "ins    %[b1].d[1], x20\n"
-                "udot    v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+                "udot	v9.4s , %[b0].16b, %[a0a].4b[1]\n"
                 "ldr    x20, [%[b_ptr], #88]\n"
-                "udot    v10.4s, %[b0].16b, %[a0a].4b[2]\n"
-                "udot    v11.4s, %[b0].16b, %[a0a].4b[3]\n"
-                "ldr    %d[a0], [%[a_ptr], #64]\n"
+                "udot	v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+                "udot	v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+                "ldr	%d[a0], [%[a_ptr], #64]\n"
 
-                "udot     v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+                "udot 	v12.4s, %[b0].16b, %[a1a].4b[0]\n"
                 "ins    %[b2].d[1], x20\n"
                 "udot   v13.4s, %[b0].16b, %[a1a].4b[1]\n"
                 "ldr    x20, [%[a_ptr], #72]\n"
-                "udot    v14.4s, %[b0].16b, %[a1a].4b[2]\n"
-                "udot    v15.4s, %[b0].16b, %[a1a].4b[3]\n"
-                "ldr    %d[a1], [%[a_ptr], #80]\n"
+                "udot	v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+                "udot	v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+                "ldr	%d[a1], [%[a_ptr], #80]\n"
 
-                "udot    v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+                "udot	v16.4s, %[b1].16b, %[a0a].4b[0]\n"
                 "ins    %[a0].d[1], x20\n"
-                "udot    v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+                "udot	v17.4s, %[b1].16b, %[a0a].4b[1]\n"
                 "ldr    x20, [%[a_ptr], #88]\n"
-                "udot    v18.4s, %[b1].16b, %[a0a].4b[2]\n"
-                "udot    v19.4s, %[b1].16b, %[a0a].4b[3]\n"
-                "ldr    %d[b0], [%[b_ptr], #96]\n"
+                "udot	v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+                "udot	v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+                "ldr	%d[b0], [%[b_ptr], #96]\n"
 
-                "udot    v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+                "udot	v20.4s, %[b1].16b, %[a1a].4b[0]\n"
                 "ins    %[a1].d[1], x20\n"
-                "udot    v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+                "udot	v21.4s, %[b1].16b, %[a1a].4b[1]\n"
                 "ldr    x20, [%[b_ptr], #104]\n"
-                "udot    v22.4s, %[b1].16b, %[a1a].4b[2]\n"
-                "udot    v23.4s, %[b1].16b, %[a1a].4b[3]\n"
-                "ldr    %d[b1], [%[b_ptr], #112]\n"
+                "udot	v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+                "udot	v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+                "ldr	%d[b1], [%[b_ptr], #112]\n"
 
-                "udot    v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+                "udot	v24.4s, %[b2].16b, %[a0a].4b[0]\n"
                 "ins    %[b0].d[1], x20\n"
-                "udot    v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+                "udot	v25.4s, %[b2].16b, %[a0a].4b[1]\n"
                 "ldr    x20, [%[b_ptr], #120]\n"
-                "udot    v26.4s, %[b2].16b, %[a0a].4b[2]\n"
-                "udot    v27.4s, %[b2].16b, %[a0a].4b[3]\n"
-                "add    %[a_ptr], %[a_ptr], #64\n"
+                "udot	v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+                "udot	v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+                "add	%[a_ptr], %[a_ptr], #64\n"
 
-                "udot    v28.4s, %[b2].16b, %[a1a].4b[0]\n" ASM_PREFETCH("[%[b_ptr], #640]")
-                "udot    v29.4s, %[b2].16b, %[a1a].4b[1]\n"
-                "add    %[b_ptr], %[b_ptr], #96\n"
-                "udot    v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+                "udot	v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+                ASM_PREFETCH("[%[b_ptr], #640]")
+                "udot	v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+                "add	%[b_ptr], %[b_ptr], #96\n"
+                "udot	v30.4s, %[b2].16b, %[a1a].4b[2]\n"
                 "ins    %[b1].d[1], x20\n"
-                "udot    v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+                "udot	v31.4s, %[b2].16b, %[a1a].4b[3]\n"
                 "ldr    %d[b2], [%[b_ptr], #32]\n"
 
                 "udot   v8.4s , %[b0].16b, %[a0].4b[0]\n"
-                "b.ne    1b\n"
+                "b.ne	1b\n"
 
                 // Branch here if K=1 or 2.  Do the right thing for odd/even at the end.
                 "4:\n"
@@ -212,71 +229,83 @@
                 "cbnz   %w[oddk], 2f\n"
 
                 // Even K continuation
-                "udot    v11.4s, %[b0].16b, %[a0].4b[3]\n"
-                "ldr    %d[a0a], [%[a_ptr], #32]\n"
+                "udot	v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "ldr	%d[a0a], [%[a_ptr], #32]\n"
 
-                "udot     v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "udot 	v12.4s, %[b0].16b, %[a1].4b[0]\n"
                 "ins    %[b2].d[1], x20\n"
                 "udot   v13.4s, %[b0].16b, %[a1].4b[1]\n"
                 "ldr    x20, [%[a_ptr], #40]\n"
-                "udot    v14.4s, %[b0].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr]]")
-                "udot    v15.4s, %[b0].16b, %[a1].4b[3]\n"
-                "ldr    %d[a1a], [%[a_ptr], #48]\n"
+                "udot	v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                ASM_PREFETCHW("[%[c_ptr]]")
+                "udot	v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "ldr	%d[a1a], [%[a_ptr], #48]\n"
 
-                "udot    v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "udot	v16.4s, %[b1].16b, %[a0].4b[0]\n"
                 "ins    %[a0a].d[1], x20\n"
-                "udot    v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                "udot	v17.4s, %[b1].16b, %[a0].4b[1]\n"
                 "ldr    x20, [%[a_ptr], #56]\n"
-                "udot    v18.4s, %[b1].16b, %[a0].4b[2]\n"
-                "udot    v19.4s, %[b1].16b, %[a0].4b[3]\n"
-                "ldr    %d[b0], [%[b_ptr], #48]\n"
+                "udot	v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "udot	v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "ldr	%d[b0], [%[b_ptr], #48]\n"
 
-                "udot    v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "udot	v20.4s, %[b1].16b, %[a1].4b[0]\n"
                 "ins    %[a1a].d[1], x20\n"
-                "udot    v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "udot	v21.4s, %[b1].16b, %[a1].4b[1]\n"
                 "ldr    x20, [%[b_ptr], #56]\n"
-                "udot    v22.4s, %[b1].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
-                "udot    v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "udot	v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                ASM_PREFETCHW("[%[c_ptr], #64]")
+                "udot	v23.4s, %[b1].16b, %[a1].4b[3]\n"
 
-                "udot    v24.4s, %[b2].16b, %[a0].4b[0]\n"
-                "udot    v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
-                "udot    v26.4s, %[b2].16b, %[a0].4b[2]\n"
-                "udot    v27.4s, %[b2].16b, %[a0].4b[3]\n"
-                "ldr    %d[b1], [%[b_ptr], #64]\n"
+                "udot	v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "udot	v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                ASM_PREFETCHW("[%[c_ptr], #128]")
+                "udot	v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "udot	v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "ldr	%d[b1], [%[b_ptr], #64]\n"
 
-                "udot    v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "udot	v28.4s, %[b2].16b, %[a1].4b[0]\n"
                 "ins    %[b0].d[1], x20\n"
-                "udot    v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                "udot	v29.4s, %[b2].16b, %[a1].4b[1]\n"
                 "ldr    x20, [%[b_ptr], #72]\n"
-                "udot    v30.4s, %[b2].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
-                "udot    v31.4s, %[b2].16b, %[a1].4b[3]\n"
-                "ldr    %d[b2], [%[b_ptr], #80]\n"
+                "udot	v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                ASM_PREFETCHW("[%[c_ptr], #192]")
+                "udot	v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "ldr	%d[b2], [%[b_ptr], #80]\n"
 
-                "udot    v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+                "udot	v8.4s , %[b0].16b, %[a0a].4b[0]\n"
                 "ins    %[b1].d[1], x20\n"
                 "udot   v9.4s , %[b0].16b, %[a0a].4b[1]\n"
                 "ldr    x20, [%[b_ptr], #88]\n"
-                "udot    v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+                "udot	v10.4s, %[b0].16b, %[a0a].4b[2]\n"
                 "ins    %[b2].d[1], x20\n"
 
-                "udot   v11.4s, %[b0].16b, %[a0a].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+                "udot   v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+                ASM_PREFETCHW("[%[c_ptr], #256]")
                 "udot   v12.4s, %[b0].16b, %[a1a].4b[0]\n"
                 "udot   v13.4s, %[b0].16b, %[a1a].4b[1]\n"
-                "udot   v14.4s, %[b0].16b, %[a1a].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+                "udot   v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+                ASM_PREFETCHW("[%[c_ptr], #320]")
                 "udot   v15.4s, %[b0].16b, %[a1a].4b[3]\n"
-                "udot   v16.4s, %[b1].16b, %[a0a].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+                "udot   v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #384]")
                 "udot   v17.4s, %[b1].16b, %[a0a].4b[1]\n"
-                "udot   v18.4s, %[b1].16b, %[a0a].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+                "udot   v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #448]")
                 "udot   v19.4s, %[b1].16b, %[a0a].4b[3]\n"
                 "udot   v20.4s, %[b1].16b, %[a1a].4b[0]\n"
-                "udot   v21.4s, %[b1].16b, %[a1a].4b[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]")
+                "udot   v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #512]")
                 "udot   v22.4s, %[b1].16b, %[a1a].4b[2]\n"
-                "udot   v23.4s, %[b1].16b, %[a1a].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]")
+                "udot   v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #576]")
                 "udot   v24.4s, %[b2].16b, %[a0a].4b[0]\n"
                 "udot   v25.4s, %[b2].16b, %[a0a].4b[1]\n"
-                "udot   v26.4s, %[b2].16b, %[a0a].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #640]")
+                "udot   v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #640]")
                 "udot   v27.4s, %[b2].16b, %[a0a].4b[3]\n"
-                "udot   v28.4s, %[b2].16b, %[a1a].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+                "udot   v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #704]")
                 "udot   v29.4s, %[b2].16b, %[a1a].4b[1]\n"
                 "add    %[a_ptr], %[a_ptr], #64\n"
                 "udot   v30.4s, %[b2].16b, %[a1a].4b[2]\n"
@@ -286,27 +315,41 @@
 
                 // Odd K continuation
                 "2:\n"
-                "udot   v11.4s, %[b0].16b, %[a0].4b[3]\n" ASM_PREFETCHW("[%[c_ptr]]")
+                "udot   v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                ASM_PREFETCHW("[%[c_ptr]]")
                 "udot   v12.4s, %[b0].16b, %[a1].4b[0]\n"
                 "ins    %[b2].d[1], x20\n"
-                "udot   v13.4s, %[b0].16b, %[a1].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+                "udot   v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                ASM_PREFETCHW("[%[c_ptr], #64]")
                 "udot   v14.4s, %[b0].16b, %[a1].4b[2]\n"
                 "add    %[a_ptr], %[a_ptr], #32\n"
-                "udot   v15.4s, %[b0].16b, %[a1].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+                "udot   v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                ASM_PREFETCHW("[%[c_ptr], #128]")
                 "udot   v16.4s, %[b1].16b, %[a0].4b[0]\n"
                 "add    %[b_ptr], %[b_ptr], #48\n"
-                "udot   v17.4s, %[b1].16b, %[a0].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+                "udot   v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                ASM_PREFETCHW("[%[c_ptr], #192]")
                 "udot   v18.4s, %[b1].16b, %[a0].4b[2]\n"
-                "udot   v19.4s, %[b1].16b, %[a0].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+                "udot   v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                ASM_PREFETCHW("[%[c_ptr], #256]")
                 "udot   v20.4s, %[b1].16b, %[a1].4b[0]\n"
-                "udot   v21.4s, %[b1].16b, %[a1].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+                "udot   v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                ASM_PREFETCHW("[%[c_ptr], #320]")
                 "udot   v22.4s, %[b1].16b, %[a1].4b[2]\n"
-                "udot   v23.4s, %[b1].16b, %[a1].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+                "udot   v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #384]")
                 "udot   v24.4s, %[b2].16b, %[a0].4b[0]\n"
-                "udot   v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+                "udot   v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #448]")
                 "udot   v26.4s, %[b2].16b, %[a0].4b[2]\n"
-                "udot   v27.4s, %[b2].16b, %[a0].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]") "udot   v28.4s, %[b2].16b, %[a1].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]") "udot   v29.4s, %[b2].16b, %[a1].4b[1]\n"
-                ASM_PREFETCHWL2("[%[c_ptr], #640]") "udot   v30.4s, %[b2].16b, %[a1].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+                "udot   v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #512]")
+                "udot   v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #576]")
+                "udot   v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #640]")
+                "udot   v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #704]")
                 "udot   v31.4s, %[b2].16b, %[a1].4b[3]\n"
 
                 // Common tail
@@ -340,13 +383,15 @@
 #ifdef NO_DOT_IN_TOOLCHAIN
                 ".purgem udot\n"
 #endif
-                :
-                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
-                [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
-                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
-                : [oddk] "r"(oddk)
-                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
-                "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory");
+            :
+              [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+              [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
+              [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
+            : [oddk] "r" (oddk)
+            : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+              "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+            );
+
         }
     }
 }
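Every multiply-accumulate line in the kernel above is a by-element dot product: one udot adds four u8*u8 products into a single 32-bit accumulator lane. The following is a minimal scalar sketch of that semantics; the helper name and data layout are illustrative only and not taken from arm_gemm.

// Scalar model of "udot vd.4s, vn.16b, vm.4b[idx]": each 32-bit lane of vd
// accumulates the dot product of the corresponding 4 bytes of vn with the
// idx-th 4-byte group of vm.
#include <cstdint>
#include <cstdio>

static void udot_lane_model(uint32_t d[4], const uint8_t n[16], const uint8_t m[16], int idx) {
    for (int lane = 0; lane < 4; lane++) {
        uint32_t sum = 0;
        for (int b = 0; b < 4; b++) {
            sum += static_cast<uint32_t>(n[lane * 4 + b]) * static_cast<uint32_t>(m[idx * 4 + b]);
        }
        d[lane] += sum;  // accumulate, as the kernel does across the K loop
    }
}

int main() {
    uint8_t a[16], b[16];
    for (int i = 0; i < 16; i++) { a[i] = uint8_t(i); b[i] = uint8_t(2 * i); }
    uint32_t acc[4] = {0, 0, 0, 0};
    udot_lane_model(acc, b, a, 0);   // models "udot v8.4s, %[b0].16b, %[a0].4b[0]"
    printf("%u %u %u %u\n", (unsigned)acc[0], (unsigned)acc[1], (unsigned)acc[2], (unsigned)acc[3]);
    return 0;
}

Each such instruction therefore consumes four values of K per accumulator, which is why the loop bookkeeping in these kernels divides K by 4.
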
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h
index 5ee273b..b05e899 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h
@@ -22,45 +22,46 @@
  * SOFTWARE.
  */
 
+
 // Define a macro to assemble the UDOT instruction (in the absence of toolchain support)
-#define _DECLARE_UDOT                                                                                  \
-    ".altmacro\n"                                                                                      \
-    ".macro udot opd:req, opn:req, opm:req\n"                                                          \
-    "local vd, vn, vm, h, l\n"                                                                         \
-    ".irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n" \
-    ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n"                                                               \
-    ".set vd,\\reg\n"                                                                                  \
-    ".endif\n"                                                                                         \
-    ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n"                                                              \
-    ".set vn,\\reg\n"                                                                                  \
-    ".endif\n"                                                                                         \
-    ".irp idx,0,1,2,3\n"                                                                               \
-    ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n"                                                      \
-    ".set vm,\\reg\n"                                                                                  \
-    ".set h,\\idx / 2\n"                                                                               \
-    ".set l,\\idx %% 2\n"                                                                              \
-    ".endif\n"                                                                                         \
-    ".endr\n"                                                                                          \
-    ".endr\n"                                                                                          \
-    ".ifndef vd\n"                                                                                     \
-    ".error \"Bad operand \\opd\"\n"                                                                   \
-    ".exitm\n"                                                                                         \
-    ".endif\n"                                                                                         \
-    ".ifndef vn\n"                                                                                     \
-    ".error \"Bad operand \\opn\"\n"                                                                   \
-    ".exitm\n"                                                                                         \
-    ".endif\n"                                                                                         \
-    ".ifndef vm\n"                                                                                     \
-    ".error \"Bad operand \\opm\"\n"                                                                   \
-    ".exitm\n"                                                                                         \
-    ".endif\n"                                                                                         \
-    ".ifndef h\n"                                                                                      \
-    ".error \"Bad operand \\opm\"\n"                                                                   \
-    ".exitm\n"                                                                                         \
-    ".endif\n"                                                                                         \
-    ".ifndef l\n"                                                                                      \
-    ".error \"Bad operand \\opm\"\n"                                                                   \
-    ".exitm\n"                                                                                         \
-    ".endif\n"                                                                                         \
-    ".int     0x6f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n"                      \
-    ".endm\n"
+#define _DECLARE_UDOT ".altmacro\n"\
+    ".macro udot opd:req, opn:req, opm:req\n"\
+    "local vd, vn, vm, h, l\n"\
+    ".irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n"\
+    ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n"\
+    ".set vd,\\reg\n"\
+    ".endif\n"\
+    ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n"\
+    ".set vn,\\reg\n"\
+    ".endif\n"\
+    ".irp idx,0,1,2,3\n"\
+    ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n"\
+    ".set vm,\\reg\n"\
+    ".set h,\\idx / 2\n"\
+    ".set l,\\idx %% 2\n"\
+    ".endif\n"\
+    ".endr\n"\
+    ".endr\n"\
+    ".ifndef vd\n"\
+    ".error \"Bad operand \\opd\"\n"\
+    ".exitm\n"\
+    ".endif\n"\
+    ".ifndef vn\n"\
+    ".error \"Bad operand \\opn\"\n"\
+    ".exitm\n"\
+    ".endif\n"\
+    ".ifndef vm\n"\
+    ".error \"Bad operand \\opm\"\n"\
+    ".exitm\n"\
+    ".endif\n"\
+    ".ifndef h\n"\
+    ".error \"Bad operand \\opm\"\n"\
+    ".exitm\n"\
+    ".endif\n"\
+    ".ifndef l\n"\
+    ".error \"Bad operand \\opm\"\n"\
+    ".exitm\n"\
+    ".endif\n"\
+    ".int	 0x6f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n"\
+    ".endm\n"\
+
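When the toolchain has no dot-product support, the macro above assembles the UDOT instruction word by hand and emits it with ".int". As a quick sanity check of that arithmetic, the snippet below evaluates the same bit expression in C++ for one arbitrary operand choice; the helper name is hypothetical and nothing here is part of the library.

// Reproduces the instruction word _DECLARE_UDOT emits via ".int". The bit
// layout is copied from the macro's own expression; operands (v8, v2, v0,
// lane 0) are arbitrary.
#include <cstdint>
#include <cstdio>

static uint32_t encode_udot(unsigned vd, unsigned vn, unsigned vm, unsigned idx) {
    unsigned h = idx / 2;        // matches ".set h,\idx / 2"
    unsigned l = idx % 2;        // matches ".set l,\idx %% 2"
    return 0x6f80e000u | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11);
}

int main() {
    // Word the macro would emit for "udot v8.4s, v2.16b, v0.4b[0]".
    printf("0x%08x\n", (unsigned)encode_udot(8, 2, 0, 0));
    return 0;
}
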
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp
index d026dc5..80dd873 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp
@@ -31,309 +31,328 @@
 #include "dot_toolchain_support.h"
 #endif
 
-namespace arm_gemm
-{
-void a64_gemm_u8_12x8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K)
-{
+namespace arm_gemm {
+
+void a64_gemm_u8_12x8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
     const uint8_t *a_ptr = Apanel;
-    uint32_t      *c_ptr = Cpanel;
+    uint32_t *c_ptr = Cpanel;
     // We divide K by 4 because the udot instruction processes 4 elements at a time.
-    const int W = K / 4;
+    const int W = K/4;
     // Fix up for odd lengths - set a flag if K is odd, but make
     // sure we round up the iteration count.
-    const int oddk         = (W & 1);
-    const int init_value_k = ((W + 1) / 2) - 1;
-    for(int yb = 0; yb < ablocks; yb++)
-    {
+    const int oddk = (W & 1);
+    const int init_value_k = ((W+1)/2) - 1;
+    for (int yb=0; yb<ablocks; yb++) {
         const uint8_t *a_ptr0 = a_ptr;
-        const uint8_t *b_ptr  = Bpanel;
-        for(int xb = 0; xb < bblocks; xb++)
-        {
-            a_ptr                 = a_ptr0;
-            int                 k = init_value_k;
-            register uint8x16_t a0 asm("v0");
-            register uint8x16_t a1 asm("v1");
-            register uint8x16_t b0 asm("v2");
-            register uint8x16_t b1 asm("v3");
-            register uint8x16_t b2 asm("v4");
+        const uint8_t *b_ptr = Bpanel;
+        for (int xb=0; xb<bblocks; xb++) {
+            a_ptr = a_ptr0;
+            int k = init_value_k;
+            register uint8x16_t a0  asm("v0");
+            register uint8x16_t a1  asm("v1");
+            register uint8x16_t b0  asm("v2");
+            register uint8x16_t b1  asm("v3");
+            register uint8x16_t b2  asm("v4");
             register uint8x16_t a0a asm("v5");
             register uint8x16_t a1a asm("v6");
-            __asm __volatile(
+            __asm __volatile (
 #ifdef NO_DOT_IN_TOOLCHAIN
                 _DECLARE_UDOT
 #else
                 ".arch  armv8.2-a+dotprod\n"
 #endif
                 // Initialize result registers, load initial operands, prime prefetches.
-                "movi    v8.4s, #0x0\n"
-                "ldr    %q[a0], [%[a_ptr]]\n"
-                "movi    v9.4s, #0x0\n"
-                "ldr    %q[b0], [%[b_ptr]]\n"
-                "movi    v10.4s, #0x0\n"
-                "ldr    %q[a1], [%[a_ptr], #16]\n"
-                "movi    v11.4s, #0x0\n"
-                "ldr    %q[b1], [%[b_ptr], #16]\n"
-                "movi    v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi    v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi    v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi    v15.4s, #0x0\n"
-                ASM_PREFETCH("[%[a_ptr], #128]") "movi    v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi    v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") "movi    v18.4s, #0x0\n"
-                ASM_PREFETCH("[%[a_ptr], #192]") "movi    v19.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]") "movi    v20.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]") "movi    v21.4s, #0x0\n"
+                "movi	v8.4s, #0x0\n"
+                "ldr	%q[a0], [%[a_ptr]]\n"
+                "movi	v9.4s, #0x0\n"
+                "ldr	%q[b0], [%[b_ptr]]\n"
+                "movi	v10.4s, #0x0\n"
+                "ldr	%q[a1], [%[a_ptr], #16]\n"
+                "movi	v11.4s, #0x0\n"
+                "ldr	%q[b1], [%[b_ptr], #16]\n"
+                "movi	v12.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi	v13.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi	v14.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi	v15.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #128]")
+                "movi	v16.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi	v17.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi	v18.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi	v19.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi	v20.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #256]")
+                "movi	v21.4s, #0x0\n"
                 ASM_PREFETCH("[%[b_ptr], #384]")
-                "movi    v22.4s, #0x0\n"
-                "movi    v23.4s, #0x0\n"
-                "movi    v24.4s, #0x0\n"
-                "movi    v25.4s, #0x0\n"
-                "movi    v26.4s, #0x0\n"
-                "movi    v27.4s, #0x0\n"
-                "movi    v28.4s, #0x0\n"
-                "movi    v29.4s, #0x0\n"
-                "movi    v30.4s, #0x0\n"
-                "movi    v31.4s, #0x0\n"
+                "movi	v22.4s, #0x0\n"
+                "movi	v23.4s, #0x0\n"
+                "movi	v24.4s, #0x0\n"
+                "movi	v25.4s, #0x0\n"
+                "movi	v26.4s, #0x0\n"
+                "movi	v27.4s, #0x0\n"
+                "movi	v28.4s, #0x0\n"
+                "movi	v29.4s, #0x0\n"
+                "movi	v30.4s, #0x0\n"
+                "movi	v31.4s, #0x0\n"
 
                 // Skip loop if we are doing zero iterations of it.
-                "cbz    %w[k], 4f\n"
+                "cbz	%w[k], 4f\n"
 
                 // Loop proper
                 "1:\n"
-                "udot    v8.4s , %[b0].16b, %[a0].4b[0]\n"
-                "udot      v9.4s , %[b0].16b, %[a0].4b[1]\n"
+                "udot	v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                "udot  	v9.4s , %[b0].16b, %[a0].4b[1]\n"
 
-                "ldr    %q[b2], [%[b_ptr], #32]\n"
-                "udot    v10.4s, %[b0].16b, %[a0].4b[2]\n"
-                "udot    v11.4s, %[b0].16b, %[a0].4b[3]\n"
-                "ldr    %q[a0a], [%[a_ptr], #32]\n"
-                "udot     v12.4s, %[b0].16b, %[a1].4b[0]\n"
-                "udot    v13.4s, %[b0].16b, %[a1].4b[1]\n"
-                "ldr    %q[a1a], [%[a_ptr], #48]\n"
-                "udot    v14.4s, %[b0].16b, %[a1].4b[2]\n"
-                "udot    v15.4s, %[b0].16b, %[a1].4b[3]\n"
-                "ldr    %q[b0], [%[b_ptr], #48]\n"
+                "ldr	%q[b2], [%[b_ptr], #32]\n"
+                "udot	v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "udot	v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "ldr	%q[a0a], [%[a_ptr], #32]\n"
+                "udot 	v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "udot	v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                "ldr	%q[a1a], [%[a_ptr], #48]\n"
+                "udot	v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "udot	v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "ldr	%q[b0], [%[b_ptr], #48]\n"
 
-                "udot    v16.4s, %[b1].16b, %[a0].4b[0]\n"
-                "udot    v17.4s, %[b1].16b, %[a0].4b[1]\n" ASM_PREFETCH("[%[a_ptr], #320]")
-                "udot    v18.4s, %[b1].16b, %[a0].4b[2]\n"
-                "udot    v19.4s, %[b1].16b, %[a0].4b[3]\n"
-                "udot    v20.4s, %[b1].16b, %[a1].4b[0]\n"
-                "udot    v21.4s, %[b1].16b, %[a1].4b[1]\n"
-                "udot    v22.4s, %[b1].16b, %[a1].4b[2]\n"
-                "udot    v23.4s, %[b1].16b, %[a1].4b[3]\n"
-                "ldr    %q[b1], [%[b_ptr], #64]\n"
+                "udot	v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "udot	v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                ASM_PREFETCH("[%[a_ptr], #320]")
+                "udot	v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "udot	v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "udot	v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "udot	v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "udot	v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "udot	v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "ldr	%q[b1], [%[b_ptr], #64]\n"
 
-                "udot    v24.4s, %[b2].16b, %[a0].4b[0]\n"
-                "udot    v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #448]")
-                "udot    v26.4s, %[b2].16b, %[a0].4b[2]\n"
-                "udot    v27.4s, %[b2].16b, %[a0].4b[3]\n"
-                "udot    v28.4s, %[b2].16b, %[a1].4b[0]\n"
-                "udot    v29.4s, %[b2].16b, %[a1].4b[1]\n"
-                "udot    v30.4s, %[b2].16b, %[a1].4b[2]\n"
-                "udot    v31.4s, %[b2].16b, %[a1].4b[3]\n"
-                "ldr    %q[b2], [%[b_ptr], #80]\n"
+                "udot	v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "udot	v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                ASM_PREFETCH("[%[b_ptr], #448]")
+                "udot	v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "udot	v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "udot	v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "udot	v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                "udot	v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                "udot	v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "ldr	%q[b2], [%[b_ptr], #80]\n"
 
-                "udot    v8.4s , %[b0].16b, %[a0a].4b[0]\n"
-                "udot    v9.4s , %[b0].16b, %[a0a].4b[1]\n"
-                "ldr    %q[a0], [%[a_ptr], #64]\n"
-                "udot    v10.4s, %[b0].16b, %[a0a].4b[2]\n"
-                "udot    v11.4s, %[b0].16b, %[a0a].4b[3]\n"
-                "udot     v12.4s, %[b0].16b, %[a1a].4b[0]\n"
-                "ldr    %q[a1], [%[a_ptr], #80]\n"
+                "udot	v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+                "udot	v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+                "ldr	%q[a0], [%[a_ptr], #64]\n"
+                "udot	v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+                "udot	v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+                "udot 	v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+                "ldr	%q[a1], [%[a_ptr], #80]\n"
                 "udot   v13.4s, %[b0].16b, %[a1a].4b[1]\n"
-                "udot    v14.4s, %[b0].16b, %[a1a].4b[2]\n"
-                "udot    v15.4s, %[b0].16b, %[a1a].4b[3]\n"
-                "ldr    %q[b0], [%[b_ptr], #96]\n"
+                "udot	v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+                "udot	v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+                "ldr	%q[b0], [%[b_ptr], #96]\n"
 
-                "udot    v16.4s, %[b1].16b, %[a0a].4b[0]\n"
-                "udot    v17.4s, %[b1].16b, %[a0a].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #512]")
-                "udot    v18.4s, %[b1].16b, %[a0a].4b[2]\n"
-                "udot    v19.4s, %[b1].16b, %[a0a].4b[3]\n"
-                "udot    v20.4s, %[b1].16b, %[a1a].4b[0]\n"
-                "udot    v21.4s, %[b1].16b, %[a1a].4b[1]\n"
-                "udot    v22.4s, %[b1].16b, %[a1a].4b[2]\n"
-                "udot    v23.4s, %[b1].16b, %[a1a].4b[3]\n"
-                "ldr    %q[b1], [%[b_ptr], #112]\n"
+                "udot	v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+                "udot	v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+                ASM_PREFETCH("[%[b_ptr], #512]")
+                "udot	v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+                "udot	v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+                "udot	v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+                "udot	v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+                "udot	v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+                "udot	v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+                "ldr	%q[b1], [%[b_ptr], #112]\n"
 
-                "udot    v24.4s, %[b2].16b, %[a0a].4b[0]\n"
-                "udot    v25.4s, %[b2].16b, %[a0a].4b[1]\n"
-                "add    %[a_ptr], %[a_ptr], #64\n"
-                "udot    v26.4s, %[b2].16b, %[a0a].4b[2]\n"
-                "udot    v27.4s, %[b2].16b, %[a0a].4b[3]\n"
-                "add    %[b_ptr], %[b_ptr], #96\n"
-                "udot    v28.4s, %[b2].16b, %[a1a].4b[0]\n"
-                "udot    v29.4s, %[b2].16b, %[a1a].4b[1]\n"
-                "subs    %w[k], %w[k], #1\n"
-                "udot    v30.4s, %[b2].16b, %[a1a].4b[2]\n"
-                "udot    v31.4s, %[b2].16b, %[a1a].4b[3]\n"
-                "bne    1b\n"
+                "udot	v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+                "udot	v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+                "add	%[a_ptr], %[a_ptr], #64\n"
+                "udot	v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+                "udot	v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+                "add	%[b_ptr], %[b_ptr], #96\n"
+                "udot	v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+                "udot	v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+                "subs	%w[k], %w[k], #1\n"
+                "udot	v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+                "udot	v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+                "bne	1b\n"
 
                 // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
                 "4:\n"
 
                 // Branch to alternative tail for odd K
-                "cbnz    %w[oddk], 2f\n"
+                "cbnz	%w[oddk], 2f\n"
 
                 // Detached final iteration (even K)
-                "udot    v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                "udot	v8.4s , %[b0].16b, %[a0].4b[0]\n"
                 "udot   v9.4s , %[b0].16b, %[a0].4b[1]\n"
-                "ldr    %q[b2], [%[b_ptr], #32]\n"
-                "udot    v10.4s, %[b0].16b, %[a0].4b[2]\n"
-                "udot    v11.4s, %[b0].16b, %[a0].4b[3]\n"
-                "ldr    %q[a0a], [%[a_ptr], #32]\n"
-                "udot     v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "ldr	%q[b2], [%[b_ptr], #32]\n"
+                "udot	v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "udot	v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "ldr	%q[a0a], [%[a_ptr], #32]\n"
+                "udot 	v12.4s, %[b0].16b, %[a1].4b[0]\n"
                 "udot   v13.4s, %[b0].16b, %[a1].4b[1]\n"
-                "ldr    %q[a1a], [%[a_ptr], #48]\n"
-                "udot    v14.4s, %[b0].16b, %[a1].4b[2]\n"
-                "udot    v15.4s, %[b0].16b, %[a1].4b[3]\n"
-                "ldr    %q[b0], [%[b_ptr], #48]\n"
+                "ldr	%q[a1a], [%[a_ptr], #48]\n"
+                "udot	v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "udot	v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "ldr	%q[b0], [%[b_ptr], #48]\n"
 
-                "udot    v16.4s, %[b1].16b, %[a0].4b[0]\n"
-                "udot    v17.4s, %[b1].16b, %[a0].4b[1]\n"
-                "udot    v18.4s, %[b1].16b, %[a0].4b[2]\n"
-                "udot    v19.4s, %[b1].16b, %[a0].4b[3]\n"
-                "udot    v20.4s, %[b1].16b, %[a1].4b[0]\n"
-                "udot    v21.4s, %[b1].16b, %[a1].4b[1]\n"
-                "udot    v22.4s, %[b1].16b, %[a1].4b[2]\n"
-                "udot    v23.4s, %[b1].16b, %[a1].4b[3]\n"
-                "ldr    %q[b1], [%[b_ptr], #64]\n"
+                "udot	v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "udot	v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                "udot	v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "udot	v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "udot	v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "udot	v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "udot	v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "udot	v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "ldr	%q[b1], [%[b_ptr], #64]\n"
 
-                "udot    v24.4s, %[b2].16b, %[a0].4b[0]\n"
-                "udot    v25.4s, %[b2].16b, %[a0].4b[1]\n"
-                "add    %[a_ptr], %[a_ptr], #64\n"
-                "udot    v26.4s, %[b2].16b, %[a0].4b[2]\n"
-                "udot    v27.4s, %[b2].16b, %[a0].4b[3]\n"
-                "udot    v28.4s, %[b2].16b, %[a1].4b[0]\n"
-                "udot    v29.4s, %[b2].16b, %[a1].4b[1]\n"
-                "udot    v30.4s, %[b2].16b, %[a1].4b[2]\n"
-                "udot    v31.4s, %[b2].16b, %[a1].4b[3]\n"
-                "ldr    %q[b2], [%[b_ptr], #80]\n"
+                "udot	v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "udot	v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                "add	%[a_ptr], %[a_ptr], #64\n"
+                "udot	v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "udot	v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "udot	v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "udot	v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                "udot	v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                "udot	v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "ldr	%q[b2], [%[b_ptr], #80]\n"
 
-                "udot    v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+                "udot	v8.4s , %[b0].16b, %[a0a].4b[0]\n"
 
-                "udot    v16.4s, %[b1].16b, %[a0a].4b[0]\n"
-                "add    %[b_ptr], %[b_ptr], #96\n"
+                "udot	v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+                "add	%[b_ptr], %[b_ptr], #96\n"
                 "udot   v9.4s , %[b0].16b, %[a0a].4b[1]\n"
-                "str    q8, [%[c_ptr], #0]\n"
-                "udot    v17.4s, %[b1].16b, %[a0a].4b[1]\n"
-                "str    q16, [%[c_ptr], #16]\n"
-                "udot    v24.4s, %[b2].16b, %[a0a].4b[0]\n"
-                "str    q24, [%[c_ptr], #32]\n"
+                "str	q8, [%[c_ptr], #0]\n"
+                "udot	v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+                "str	q16, [%[c_ptr], #16]\n"
+                "udot	v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+                "str	q24, [%[c_ptr], #32]\n"
 
-                "udot    v25.4s, %[b2].16b, %[a0a].4b[1]\n"
-                "str    q9, [%[c_ptr], #48]\n"
-                "udot    v10.4s, %[b0].16b, %[a0a].4b[2]\n"
-                "str    q17, [%[c_ptr], #64]\n"
-                "udot    v18.4s, %[b1].16b, %[a0a].4b[2]\n"
-                "str    q25, [%[c_ptr], #80]\n"
-                "udot    v26.4s, %[b2].16b, %[a0a].4b[2]\n"
-                "str    q10, [%[c_ptr], #96]\n"
+                "udot	v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+                "str	q9, [%[c_ptr], #48]\n"
+                "udot	v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+                "str	q17, [%[c_ptr], #64]\n"
+                "udot	v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+                "str	q25, [%[c_ptr], #80]\n"
+                "udot	v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+                "str	q10, [%[c_ptr], #96]\n"
 
-                "udot    v11.4s, %[b0].16b, %[a0a].4b[3]\n"
-                "str    q18, [%[c_ptr], #112]\n"
-                "udot    v19.4s, %[b1].16b, %[a0a].4b[3]\n"
-                "str    q26, [%[c_ptr], #128]\n"
-                "udot    v27.4s, %[b2].16b, %[a0a].4b[3]\n"
-                "str    q11, [%[c_ptr], #144]\n"
+                "udot	v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+                "str	q18, [%[c_ptr], #112]\n"
+                "udot	v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+                "str	q26, [%[c_ptr], #128]\n"
+                "udot	v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+                "str	q11, [%[c_ptr], #144]\n"
 
-                "udot     v12.4s, %[b0].16b, %[a1a].4b[0]\n"
-                "str    q19, [%[c_ptr], #160]\n"
-                "udot    v20.4s, %[b1].16b, %[a1a].4b[0]\n"
-                "str    q27, [%[c_ptr], #176]\n"
-                "udot    v28.4s, %[b2].16b, %[a1a].4b[0]\n"
-                "str    q12, [%[c_ptr], #192]\n"
+                "udot 	v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+                "str	q19, [%[c_ptr], #160]\n"
+                "udot	v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+                "str	q27, [%[c_ptr], #176]\n"
+                "udot	v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+                "str	q12, [%[c_ptr], #192]\n"
 
                 "udot   v13.4s, %[b0].16b, %[a1a].4b[1]\n"
-                "str    q20, [%[c_ptr], #208]\n"
-                "udot    v21.4s, %[b1].16b, %[a1a].4b[1]\n"
-                "str    q28, [%[c_ptr], #224]\n"
-                "udot    v29.4s, %[b2].16b, %[a1a].4b[1]\n"
-                "str    q13, [%[c_ptr], #240]\n"
+                "str	q20, [%[c_ptr], #208]\n"
+                "udot	v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+                "str	q28, [%[c_ptr], #224]\n"
+                "udot	v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+                "str	q13, [%[c_ptr], #240]\n"
 
-                "udot    v14.4s, %[b0].16b, %[a1a].4b[2]\n"
-                "str    q21, [%[c_ptr], #256]\n"
-                "udot    v22.4s, %[b1].16b, %[a1a].4b[2]\n"
-                "str    q29, [%[c_ptr], #272]\n"
-                "udot    v30.4s, %[b2].16b, %[a1a].4b[2]\n"
-                "str    q14, [%[c_ptr], #288]\n"
+                "udot	v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+                "str	q21, [%[c_ptr], #256]\n"
+                "udot	v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+                "str	q29, [%[c_ptr], #272]\n"
+                "udot	v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+                "str	q14, [%[c_ptr], #288]\n"
 
-                "udot    v15.4s, %[b0].16b, %[a1a].4b[3]\n"
-                "str    q22, [%[c_ptr], #304]\n"
-                "udot    v23.4s, %[b1].16b, %[a1a].4b[3]\n"
-                "str    q30, [%[c_ptr], #320]\n"
-                "udot    v31.4s, %[b2].16b, %[a1a].4b[3]\n"
-                "str    q15, [%[c_ptr], #336]\n"
+                "udot	v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+                "str	q22, [%[c_ptr], #304]\n"
+                "udot	v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+                "str	q30, [%[c_ptr], #320]\n"
+                "udot	v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+                "str	q15, [%[c_ptr], #336]\n"
 
-                "b    3f\n"
+                "b	3f\n"
 
                 // Detached final iteration (odd K)
                 "2:\n"
-                "udot    v8.4s , %[b0].16b, %[a0].4b[0]\n"
-                "ldr    %q[b2], [%[b_ptr], #32]\n"
-                "udot    v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "udot	v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                "ldr	%q[b2], [%[b_ptr], #32]\n"
+                "udot	v16.4s, %[b1].16b, %[a0].4b[0]\n"
                 "udot   v9.4s , %[b0].16b, %[a0].4b[1]\n"
-                "str    q8, [%[c_ptr], #0]\n"
-                "udot    v17.4s, %[b1].16b, %[a0].4b[1]\n"
-                "str    q16, [%[c_ptr], #16]\n"
-                "udot    v24.4s, %[b2].16b, %[a0].4b[0]\n"
-                "add    %[b_ptr], %[b_ptr], #48\n"
-                "add    %[a_ptr], %[a_ptr], #32\n"
-                "str    q24, [%[c_ptr], #32]\n"
-                "udot    v25.4s, %[b2].16b, %[a0].4b[1]\n"
-                "str    q9, [%[c_ptr], #48]\n"
+                "str	q8, [%[c_ptr], #0]\n"
+                "udot	v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                "str	q16, [%[c_ptr], #16]\n"
+                "udot	v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "add	%[b_ptr], %[b_ptr], #48\n"
+                "add	%[a_ptr], %[a_ptr], #32\n"
+                "str	q24, [%[c_ptr], #32]\n"
+                "udot	v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                "str	q9, [%[c_ptr], #48]\n"
 
-                "udot    v10.4s, %[b0].16b, %[a0].4b[2]\n"
-                "str    q17, [%[c_ptr], #64]\n"
-                "udot    v18.4s, %[b1].16b, %[a0].4b[2]\n"
-                "str    q25, [%[c_ptr], #80]\n"
-                "udot    v26.4s, %[b2].16b, %[a0].4b[2]\n"
-                "str    q10, [%[c_ptr], #96]\n"
+                "udot	v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "str	q17, [%[c_ptr], #64]\n"
+                "udot	v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "str	q25, [%[c_ptr], #80]\n"
+                "udot	v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "str	q10, [%[c_ptr], #96]\n"
 
-                "udot    v11.4s, %[b0].16b, %[a0].4b[3]\n"
-                "str    q18, [%[c_ptr], #112]\n"
-                "udot    v19.4s, %[b1].16b, %[a0].4b[3]\n"
-                "str    q26, [%[c_ptr], #128]\n"
-                "udot    v27.4s, %[b2].16b, %[a0].4b[3]\n"
-                "str    q11, [%[c_ptr], #144]\n"
+                "udot	v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "str	q18, [%[c_ptr], #112]\n"
+                "udot	v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "str	q26, [%[c_ptr], #128]\n"
+                "udot	v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "str	q11, [%[c_ptr], #144]\n"
 
-                "udot     v12.4s, %[b0].16b, %[a1].4b[0]\n"
-                "str    q19, [%[c_ptr], #160]\n"
-                "udot    v20.4s, %[b1].16b, %[a1].4b[0]\n"
-                "str    q27, [%[c_ptr], #176]\n"
-                "udot    v28.4s, %[b2].16b, %[a1].4b[0]\n"
-                "str    q12, [%[c_ptr], #192]\n"
+                "udot 	v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "str	q19, [%[c_ptr], #160]\n"
+                "udot	v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "str	q27, [%[c_ptr], #176]\n"
+                "udot	v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "str	q12, [%[c_ptr], #192]\n"
 
                 "udot   v13.4s, %[b0].16b, %[a1].4b[1]\n"
-                "str    q20, [%[c_ptr], #208]\n"
-                "udot    v21.4s, %[b1].16b, %[a1].4b[1]\n"
-                "str    q28, [%[c_ptr], #224]\n"
-                "udot    v29.4s, %[b2].16b, %[a1].4b[1]\n"
-                "str    q13, [%[c_ptr], #240]\n"
+                "str	q20, [%[c_ptr], #208]\n"
+                "udot	v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "str	q28, [%[c_ptr], #224]\n"
+                "udot	v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                "str	q13, [%[c_ptr], #240]\n"
 
-                "udot    v14.4s, %[b0].16b, %[a1].4b[2]\n"
-                "str    q21, [%[c_ptr], #256]\n"
-                "udot    v22.4s, %[b1].16b, %[a1].4b[2]\n"
-                "str    q29, [%[c_ptr], #272]\n"
-                "udot    v30.4s, %[b2].16b, %[a1].4b[2]\n"
-                "str    q14, [%[c_ptr], #288]\n"
+                "udot	v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "str	q21, [%[c_ptr], #256]\n"
+                "udot	v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "str	q29, [%[c_ptr], #272]\n"
+                "udot	v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                "str	q14, [%[c_ptr], #288]\n"
 
-                "udot    v15.4s, %[b0].16b, %[a1].4b[3]\n"
-                "str    q22, [%[c_ptr], #304]\n"
-                "udot    v23.4s, %[b1].16b, %[a1].4b[3]\n"
-                "str    q30, [%[c_ptr], #320]\n"
-                "udot    v31.4s, %[b2].16b, %[a1].4b[3]\n"
-                "str    q15, [%[c_ptr], #336]\n"
+                "udot	v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "str	q22, [%[c_ptr], #304]\n"
+                "udot	v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "str	q30, [%[c_ptr], #320]\n"
+                "udot	v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "str	q15, [%[c_ptr], #336]\n"
+
 
                 // Common tail
                 "3:\n"
-                "str    q23, [%[c_ptr], #352]\n"
-                "str    q31, [%[c_ptr], #368]\n"
-                "add    %[c_ptr], %[c_ptr], #384\n"
+                "str	q23, [%[c_ptr], #352]\n"
+                "str	q31, [%[c_ptr], #368]\n"
+                "add	%[c_ptr], %[c_ptr], #384\n"
 
 #ifdef NO_DOT_IN_TOOLCHAIN
                 ".purgem udot\n"
 #endif
-                :
-                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
-                [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
-                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
-                : [oddk] "r"(oddk)
-                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
-                "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+            :
+              [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+              [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
+              [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
+            : [oddk] "r" (oddk)
+            : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+              "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
+            );
+
         }
     }
 }
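The K handling in this kernel follows the pattern spelled out in its comments: K is split into groups of four u8 values (one udot per group), the main loop consumes two groups per iteration, and a detached final iteration or odd-K tail finishes the remainder. Below is a small self-checking sketch of that bookkeeping, assuming K is a multiple of 4 as the kernel's integer division presupposes; only the three expressions marked verbatim come from the source, the "two groups per pass" reading is an interpretation of the assembly.

// Sketch of the K-loop bookkeeping in a64_gemm_u8_12x8.
#include <cassert>

static void check_k_decomposition(int K) {
    const int W = K / 4;                        // verbatim: groups of 4 u8 values
    const int oddk = (W & 1);                   // verbatim: odd number of groups?
    const int init_value_k = ((W + 1) / 2) - 1; // verbatim: main-loop iteration count

    // Main loop ("1:") covers two groups per pass; the detached final
    // iteration covers two groups for even W, the odd tail ("2:") covers one.
    int groups_covered = 2 * init_value_k + (oddk ? 1 : 2);
    assert(groups_covered == W);
}

int main() {
    for (int K = 4; K <= 256; K += 4) {
        check_k_decomposition(K);
    }
    return 0;
}
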
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
index 5aa5291..23f4c1d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
@@ -25,38 +25,36 @@
 
 #ifdef __aarch64__
 
-namespace arm_gemm
-{
+namespace arm_gemm {
+
 // Kernel definition
 void a64_gemm_u8_4x4(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K);
 
-class gemm_u8_4x4
-{
+class gemm_u8_4x4 {
 public:
-    typedef uint8_t  operand_type;
+    typedef uint8_t operand_type;
     typedef uint32_t result_type;
 
     typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
 
     /* Describes the data layout for A input */
-    static const int  A_interleave = 4;
-    static const int  A_block      = 16;
-    static const bool A_transpose  = false;
+    static const int A_interleave = 4;
+    static const int A_block = 16;
+    static const bool A_transpose = false;
 
     /* Same for B input */
-    static const int  B_interleave = 4;
-    static const int  B_block      = 16;
-    static const bool B_transpose  = true;
+    static const int B_interleave = 4;
+    static const int B_block = 16;
+    static const bool B_transpose = true;
 
     /* Kernel blocking parameters */
-    static const int out_width  = 4;
+    static const int out_width = 4;
     static const int out_height = 4;
-    static const int k_unroll   = 16;
+    static const int k_unroll = 16;
 
     kern_type kernel = nullptr;
 
-    gemm_u8_4x4(const CPUInfo *ci)
-    {
+    gemm_u8_4x4(const CPUInfo *ci) {
         kernel = a64_gemm_u8_4x4;
     }
 };
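The descriptor above exposes the blocking parameters a caller needs: the A/B panel interleave and block sizes, the 4x4 output tile, and a K unroll of 16 (matching the "K /= 16" in the kernel body). A hypothetical caller-side sketch of how such constants might be used when sizing buffers follows; none of these helpers exist in arm_gemm, they only restate the header's values.

// Illustrative use of the gemm_u8_4x4 blocking constants.
#include <cstddef>
#include <cstdio>

struct Blocking {
    static const int out_width  = 4;
    static const int out_height = 4;
    static const int k_unroll   = 16;
};

static std::size_t padded_k(std::size_t K) {
    // Round K up so the kernel's "K /= 16" loop sees whole unrolled groups.
    return ((K + Blocking::k_unroll - 1) / Blocking::k_unroll) * Blocking::k_unroll;
}

static std::size_t c_tile_elems() {
    // Each kernel tile produces out_height x out_width uint32_t results.
    return std::size_t(Blocking::out_width) * Blocking::out_height;
}

int main() {
    printf("padded K for 37: %zu, C tile elements: %zu\n", padded_k(37), c_tile_elems());
    return 0;
}
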
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp
index 0a881ff..2e60833 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,243 +27,255 @@
 
 #include "../../asmlib.hpp"
 
-namespace arm_gemm
-{
-void a64_gemm_u8_4x4(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K)
-{
+namespace arm_gemm {
+
+void a64_gemm_u8_4x4(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
     const uint8_t *a_ptr = Apanel;
-    uint32_t      *c_ptr = Cpanel;
+    uint32_t *c_ptr = Cpanel;
     K /= 16;
 
-    for(int yb = 0; yb < ablocks; yb++)
-    {
+    for (int yb=0; yb<ablocks; yb++) {
         const uint8_t *a_ptr0 = a_ptr;
-        const uint8_t *b_ptr  = Bpanel;
+        const uint8_t *b_ptr = Bpanel;
 
-        for(int xb = 0; xb < bblocks; xb++)
-        {
+        for (int xb=0; xb<bblocks; xb++) {
             a_ptr = a_ptr0;
 
-            int k = K - 1;
+            int k = K-1;
 
-            register uint8x16_t b0 asm("v4");
-            register uint8x16_t b1 asm("v5");
-            register uint8x16_t b2 asm("v6");
-            register uint8x16_t b3 asm("v7");
+            register uint8x16_t b0  asm("v4");
+            register uint8x16_t b1  asm("v5");
+            register uint8x16_t b2  asm("v6");
+            register uint8x16_t b3  asm("v7");
 
-            __asm __volatile(
-                "movi    v16.4s, #0x0\n"
-                "ldr    q0, [%[a_ptr]]\n"
-                "movi    v17.4s, #0x0\n"
-                "ldr    %q[b0], [%[b_ptr]]\n"
-                "movi    v18.4s, #0x0\n"
-                "ldr    %q[b1], [%[b_ptr], #16]\n"
-                "movi    v19.4s, #0x0\n"
-                "ldr    %q[b2], [%[b_ptr], #32]\n"
-                "movi    v20.4s, #0x0\n"
-                "ldr    %q[b3], [%[b_ptr], #48]\n"
-                "movi    v21.4s, #0x0\n"
-                "ldr    q1, [%[a_ptr], #16]\n"
-                "movi    v22.4s, #0x0\n"
-                "ldr    q2, [%[a_ptr], #32]\n"
-                "movi    v23.4s, #0x0\n"
-                "ldr    q3, [%[a_ptr], #48]\n"
-                "movi    v24.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi    v25.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi    v26.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi    v27.4s, #0x0\n"
-                ASM_PREFETCH("[%[a_ptr], #128]") "movi    v28.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi    v29.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]") "movi    v30.4s, #0x0\n"
-                ASM_PREFETCH("[%[b_ptr], #256]") "movi    v31.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]")
+            __asm __volatile (
+                "movi	v16.4s, #0x0\n"
+                "ldr	q0, [%[a_ptr]]\n"
+                "movi	v17.4s, #0x0\n"
+                "ldr	%q[b0], [%[b_ptr]]\n"
+                "movi	v18.4s, #0x0\n"
+                "ldr	%q[b1], [%[b_ptr], #16]\n"
+                "movi	v19.4s, #0x0\n"
+                "ldr	%q[b2], [%[b_ptr], #32]\n"
+                "movi	v20.4s, #0x0\n"
+                "ldr	%q[b3], [%[b_ptr], #48]\n"
+                "movi	v21.4s, #0x0\n"
+                "ldr	q1, [%[a_ptr], #16]\n"
+                "movi	v22.4s, #0x0\n"
+                "ldr	q2, [%[a_ptr], #32]\n"
+                "movi	v23.4s, #0x0\n"
+                "ldr	q3, [%[a_ptr], #48]\n"
+                "movi	v24.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi	v25.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi	v26.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi	v27.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #128]")
+                "movi	v28.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi	v29.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi	v30.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi	v31.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #256]")
 
-                "umull    v12.8h, v0.8b, %[b0].8b\n"
-                "add    %[a_ptr], %[a_ptr], #64\n"
-                "umull    v13.8h, v0.8b, %[b1].8b\n"
-                "umull    v14.8h, v0.8b, %[b2].8b\n"
-                "add    %[b_ptr], %[b_ptr], #64\n"
-                "umull    v15.8h, v0.8b, %[b3].8b\n"
+                "umull	v12.8h, v0.8b, %[b0].8b\n"
+		"add	%[a_ptr], %[a_ptr], #64\n"
+                "umull	v13.8h, v0.8b, %[b1].8b\n"
+                "umull	v14.8h, v0.8b, %[b2].8b\n"
+		"add	%[b_ptr], %[b_ptr], #64\n"
+                "umull	v15.8h, v0.8b, %[b3].8b\n"
 
                 // Skip loop if we are doing zero iterations of it.
-                "cbz    %w[k], 2f\n"
+                "cbz	%w[k], 2f\n"
 
                 "1:\n"
-                "uadalp    v16.4s, v12.8h\n"
-                "umull2    v12.8h, v0.16b, %[b0].16b\n"
-                "uadalp    v17.4s, v13.8h\n"
-                "umull2    v13.8h, v0.16b, %[b1].16b\n"
-                "uadalp    v18.4s, v14.8h\n"
-                "umull2    v14.8h, v0.16b, %[b2].16b\n"
-                "uadalp    v19.4s, v15.8h\n"
-                "umull2    v15.8h, v0.16b, %[b3].16b\n"
-                "ldr     q0, [%[a_ptr]]\n"
+                "uadalp	v16.4s, v12.8h\n"
+                "umull2	v12.8h, v0.16b, %[b0].16b\n"
+                "uadalp	v17.4s, v13.8h\n"
+                "umull2	v13.8h, v0.16b, %[b1].16b\n"
+                "uadalp	v18.4s, v14.8h\n"
+                "umull2	v14.8h, v0.16b, %[b2].16b\n"
+                "uadalp	v19.4s, v15.8h\n"
+                "umull2	v15.8h, v0.16b, %[b3].16b\n"
+                "ldr 	q0, [%[a_ptr]]\n"
 
-                "uadalp    v16.4s, v12.8h\n"
-                "umull    v12.8h, v1.8b, %[b0].8b\n"
-                "uadalp    v17.4s, v13.8h\n"
-                "umull    v13.8h, v1.8b, %[b1].8b\n"
-                "subs    %w[k], %w[k], #1\n"
-                "uadalp    v18.4s, v14.8h\n"
-                "umull    v14.8h, v1.8b, %[b2].8b\n"
-                "uadalp    v19.4s, v15.8h\n"
-                "umull    v15.8h, v1.8b, %[b3].8b\n"
+                "uadalp	v16.4s, v12.8h\n"
+                "umull	v12.8h, v1.8b, %[b0].8b\n"
+                "uadalp	v17.4s, v13.8h\n"
+                "umull	v13.8h, v1.8b, %[b1].8b\n"
+                "subs	%w[k], %w[k], #1\n"
+                "uadalp	v18.4s, v14.8h\n"
+                "umull	v14.8h, v1.8b, %[b2].8b\n"
+                "uadalp	v19.4s, v15.8h\n"
+                "umull	v15.8h, v1.8b, %[b3].8b\n"
 
-                "uadalp    v20.4s, v12.8h\n"
-                "umull2    v12.8h, v1.16b, %[b0].16b\n"
-                "uadalp    v21.4s, v13.8h\n"
-                "umull2    v13.8h, v1.16b, %[b1].16b\n" ASM_PREFETCH("[%[a_ptr], #256]")
-                "uadalp    v22.4s, v14.8h\n"
-                "umull2    v14.8h, v1.16b, %[b2].16b\n"
-                "uadalp    v23.4s, v15.8h\n"
-                "umull2    v15.8h, v1.16b, %[b3].16b\n"
-                "ldr     q1, [%[a_ptr], #16]\n"
+                "uadalp	v20.4s, v12.8h\n"
+                "umull2	v12.8h, v1.16b, %[b0].16b\n"
+                "uadalp	v21.4s, v13.8h\n"
+                "umull2	v13.8h, v1.16b, %[b1].16b\n"
+                ASM_PREFETCH("[%[a_ptr], #256]")
+                "uadalp	v22.4s, v14.8h\n"
+                "umull2	v14.8h, v1.16b, %[b2].16b\n"
+                "uadalp	v23.4s, v15.8h\n"
+                "umull2	v15.8h, v1.16b, %[b3].16b\n"
+                "ldr 	q1, [%[a_ptr], #16]\n"
 
-                "uadalp    v20.4s, v12.8h\n"
-                "umull    v12.8h, v2.8b, %[b0].8b\n"
-                "uadalp    v21.4s, v13.8h\n"
-                "umull    v13.8h, v2.8b, %[b1].8b\n" ASM_PREFETCH("[%[b_ptr], #256]")
-                "uadalp    v22.4s, v14.8h\n"
-                "umull    v14.8h, v2.8b, %[b2].8b\n"
-                "uadalp    v23.4s, v15.8h\n"
-                "umull    v15.8h, v2.8b, %[b3].8b\n"
+                "uadalp	v20.4s, v12.8h\n"
+                "umull	v12.8h, v2.8b, %[b0].8b\n"
+                "uadalp	v21.4s, v13.8h\n"
+                "umull	v13.8h, v2.8b, %[b1].8b\n"
+                ASM_PREFETCH("[%[b_ptr], #256]")
+                "uadalp	v22.4s, v14.8h\n"
+                "umull	v14.8h, v2.8b, %[b2].8b\n"
+                "uadalp	v23.4s, v15.8h\n"
+                "umull	v15.8h, v2.8b, %[b3].8b\n"
 
-                "uadalp    v24.4s, v12.8h\n"
-                "umull2    v12.8h, v2.16b, %[b0].16b\n"
-                "uadalp    v25.4s, v13.8h\n"
-                "umull2    v13.8h, v2.16b, %[b1].16b\n"
-                "uadalp    v26.4s, v14.8h\n"
-                "umull2    v14.8h, v2.16b, %[b2].16b\n"
-                "uadalp    v27.4s, v15.8h\n"
-                "umull2    v15.8h, v2.16b, %[b3].16b\n"
-                "ldr    q2, [%[a_ptr], #32]\n"
+                "uadalp	v24.4s, v12.8h\n"
+                "umull2	v12.8h, v2.16b, %[b0].16b\n"
+                "uadalp	v25.4s, v13.8h\n"
+                "umull2	v13.8h, v2.16b, %[b1].16b\n"
+                "uadalp	v26.4s, v14.8h\n"
+                "umull2	v14.8h, v2.16b, %[b2].16b\n"
+                "uadalp	v27.4s, v15.8h\n"
+                "umull2	v15.8h, v2.16b, %[b3].16b\n"
+                "ldr	q2, [%[a_ptr], #32]\n"
 
-                "uadalp    v24.4s, v12.8h\n"
-                "umull    v12.8h, v3.8b, %[b0].8b\n"
-                "uadalp    v25.4s, v13.8h\n"
-                "umull    v13.8h, v3.8b, %[b1].8b\n"
-                "uadalp    v26.4s, v14.8h\n"
-                "umull    v14.8h, v3.8b, %[b2].8b\n"
-                "uadalp    v27.4s, v15.8h\n"
-                "umull    v15.8h, v3.8b, %[b3].8b\n"
+                "uadalp	v24.4s, v12.8h\n"
+                "umull	v12.8h, v3.8b, %[b0].8b\n"
+                "uadalp	v25.4s, v13.8h\n"
+                "umull	v13.8h, v3.8b, %[b1].8b\n"
+                "uadalp	v26.4s, v14.8h\n"
+                "umull	v14.8h, v3.8b, %[b2].8b\n"
+                "uadalp	v27.4s, v15.8h\n"
+                "umull	v15.8h, v3.8b, %[b3].8b\n"
 
-                "uadalp    v28.4s, v12.8h\n"
-                "umull2    v12.8h, v3.16b, %[b0].16b\n"
-                "ldr    %q[b0], [%[b_ptr]]\n"
-                "uadalp    v29.4s, v13.8h\n"
-                "umull2    v13.8h, v3.16b, %[b1].16b\n"
-                "ldr    %q[b1], [%[b_ptr], #16]\n"
-                "uadalp    v30.4s, v14.8h\n"
-                "umull2    v14.8h, v3.16b, %[b2].16b\n"
-                "ldr    %q[b2], [%[b_ptr], #32]\n"
-                "uadalp    v31.4s, v15.8h\n"
-                "umull2    v15.8h, v3.16b, %[b3].16b\n"
-                "ldr    %q[b3], [%[b_ptr], #48]\n"
+                "uadalp	v28.4s, v12.8h\n"
+                "umull2	v12.8h, v3.16b, %[b0].16b\n"
+                "ldr	%q[b0], [%[b_ptr]]\n"
+                "uadalp	v29.4s, v13.8h\n"
+                "umull2	v13.8h, v3.16b, %[b1].16b\n"
+                "ldr	%q[b1], [%[b_ptr], #16]\n"
+                "uadalp	v30.4s, v14.8h\n"
+                "umull2	v14.8h, v3.16b, %[b2].16b\n"
+                "ldr	%q[b2], [%[b_ptr], #32]\n"
+                "uadalp	v31.4s, v15.8h\n"
+                "umull2	v15.8h, v3.16b, %[b3].16b\n"
+                "ldr	%q[b3], [%[b_ptr], #48]\n"
 
-                "uadalp    v28.4s, v12.8h\n"
-                "umull    v12.8h, v0.8b, %[b0].8b\n"
-                "add    %[b_ptr], %[b_ptr], #64\n"
-                "uadalp    v29.4s, v13.8h\n"
-                "umull    v13.8h, v0.8b, %[b1].8b\n"
-                "ldr    q3, [%[a_ptr], #48]\n"
-                "uadalp    v30.4s, v14.8h\n"
-                "umull    v14.8h, v0.8b, %[b2].8b\n"
-                "add    %[a_ptr], %[a_ptr], #64\n"
-                "uadalp    v31.4s, v15.8h\n"
-                "umull    v15.8h, v0.8b, %[b3].8b\n"
-                "bne    1b\n"
+                "uadalp	v28.4s, v12.8h\n"
+                "umull	v12.8h, v0.8b, %[b0].8b\n"
+                "add	%[b_ptr], %[b_ptr], #64\n"
+                "uadalp	v29.4s, v13.8h\n"
+                "umull	v13.8h, v0.8b, %[b1].8b\n"
+                "ldr	q3, [%[a_ptr], #48]\n"
+                "uadalp	v30.4s, v14.8h\n"
+                "umull	v14.8h, v0.8b, %[b2].8b\n"
+                "add	%[a_ptr], %[a_ptr], #64\n"
+                "uadalp	v31.4s, v15.8h\n"
+                "umull	v15.8h, v0.8b, %[b3].8b\n"
+                "bne	1b\n"
 
                 // Branch target
                 "2:\n"
-                "uadalp    v16.4s, v12.8h\n"
-                "umull2    v12.8h, v0.16b, %[b0].16b\n"
-                "uadalp    v17.4s, v13.8h\n"
-                "umull2    v13.8h, v0.16b, %[b1].16b\n"
-                "uadalp    v18.4s, v14.8h\n"
-                "umull2    v14.8h, v0.16b, %[b2].16b\n"
-                "uadalp    v19.4s, v15.8h\n"
-                "umull2    v15.8h, v0.16b, %[b3].16b\n"
+                "uadalp	v16.4s, v12.8h\n"
+                "umull2	v12.8h, v0.16b, %[b0].16b\n"
+                "uadalp	v17.4s, v13.8h\n"
+                "umull2	v13.8h, v0.16b, %[b1].16b\n"
+                "uadalp	v18.4s, v14.8h\n"
+                "umull2	v14.8h, v0.16b, %[b2].16b\n"
+                "uadalp	v19.4s, v15.8h\n"
+                "umull2	v15.8h, v0.16b, %[b3].16b\n"
 
-                "uadalp    v16.4s, v12.8h\n"
-                "umull    v12.8h, v1.8b, %[b0].8b\n"
-                "uadalp    v17.4s, v13.8h\n"
-                "umull    v13.8h, v1.8b, %[b1].8b\n"
-                "uadalp    v18.4s, v14.8h\n"
-                "umull    v14.8h, v1.8b, %[b2].8b\n"
-                "uadalp    v19.4s, v15.8h\n"
-                "umull    v15.8h, v1.8b, %[b3].8b\n"
+                "uadalp	v16.4s, v12.8h\n"
+                "umull	v12.8h, v1.8b, %[b0].8b\n"
+                "uadalp	v17.4s, v13.8h\n"
+                "umull	v13.8h, v1.8b, %[b1].8b\n"
+                "uadalp	v18.4s, v14.8h\n"
+                "umull	v14.8h, v1.8b, %[b2].8b\n"
+                "uadalp	v19.4s, v15.8h\n"
+                "umull	v15.8h, v1.8b, %[b3].8b\n"
 
-                "uadalp    v20.4s, v12.8h\n"
-                "umull2    v12.8h, v1.16b, %[b0].16b\n"
-                "uadalp    v21.4s, v13.8h\n"
-                "umull2    v13.8h, v1.16b, %[b1].16b\n"
-                "uadalp    v22.4s, v14.8h\n"
-                "umull2    v14.8h, v1.16b, %[b2].16b\n"
-                "uadalp    v23.4s, v15.8h\n"
-                "umull2    v15.8h, v1.16b, %[b3].16b\n"
+                "uadalp	v20.4s, v12.8h\n"
+                "umull2	v12.8h, v1.16b, %[b0].16b\n"
+                "uadalp	v21.4s, v13.8h\n"
+                "umull2	v13.8h, v1.16b, %[b1].16b\n"
+                "uadalp	v22.4s, v14.8h\n"
+                "umull2	v14.8h, v1.16b, %[b2].16b\n"
+                "uadalp	v23.4s, v15.8h\n"
+                "umull2	v15.8h, v1.16b, %[b3].16b\n"
 
-                "uadalp    v20.4s, v12.8h\n"
-                "umull    v12.8h, v2.8b, %[b0].8b\n"
-                "uadalp    v21.4s, v13.8h\n"
-                "umull    v13.8h, v2.8b, %[b1].8b\n"
-                "uadalp    v22.4s, v14.8h\n"
-                "umull    v14.8h, v2.8b, %[b2].8b\n"
-                "uadalp    v23.4s, v15.8h\n"
-                "umull    v15.8h, v2.8b, %[b3].8b\n"
+                "uadalp	v20.4s, v12.8h\n"
+                "umull	v12.8h, v2.8b, %[b0].8b\n"
+                "uadalp	v21.4s, v13.8h\n"
+                "umull	v13.8h, v2.8b, %[b1].8b\n"
+                "uadalp	v22.4s, v14.8h\n"
+                "umull	v14.8h, v2.8b, %[b2].8b\n"
+                "uadalp	v23.4s, v15.8h\n"
+                "umull	v15.8h, v2.8b, %[b3].8b\n"
 
-                "uadalp    v24.4s, v12.8h\n"
-                "umull2    v12.8h, v2.16b, %[b0].16b\n"
-                "uadalp    v25.4s, v13.8h\n"
-                "umull2    v13.8h, v2.16b, %[b1].16b\n"
-                "uadalp    v26.4s, v14.8h\n"
-                "umull2    v14.8h, v2.16b, %[b2].16b\n"
-                "uadalp    v27.4s, v15.8h\n"
-                "umull2    v15.8h, v2.16b, %[b3].16b\n"
+                "uadalp	v24.4s, v12.8h\n"
+                "umull2	v12.8h, v2.16b, %[b0].16b\n"
+                "uadalp	v25.4s, v13.8h\n"
+                "umull2	v13.8h, v2.16b, %[b1].16b\n"
+                "uadalp	v26.4s, v14.8h\n"
+                "umull2	v14.8h, v2.16b, %[b2].16b\n"
+                "uadalp	v27.4s, v15.8h\n"
+                "umull2	v15.8h, v2.16b, %[b3].16b\n"
 
-                "uadalp    v24.4s, v12.8h\n"
-                "umull    v12.8h, v3.8b, %[b0].8b\n"
-                "uadalp    v25.4s, v13.8h\n"
-                "umull    v13.8h, v3.8b, %[b1].8b\n"
-                "uadalp    v26.4s, v14.8h\n"
-                "umull    v14.8h, v3.8b, %[b2].8b\n"
-                "uadalp    v27.4s, v15.8h\n"
-                "umull    v15.8h, v3.8b, %[b3].8b\n"
+                "uadalp	v24.4s, v12.8h\n"
+                "umull	v12.8h, v3.8b, %[b0].8b\n"
+                "uadalp	v25.4s, v13.8h\n"
+                "umull	v13.8h, v3.8b, %[b1].8b\n"
+                "uadalp	v26.4s, v14.8h\n"
+                "umull	v14.8h, v3.8b, %[b2].8b\n"
+                "uadalp	v27.4s, v15.8h\n"
+                "umull	v15.8h, v3.8b, %[b3].8b\n"
 
-                "uadalp    v28.4s, v12.8h\n"
-                "umull2    v12.8h, v3.16b, %[b0].16b\n"
-                "uadalp    v29.4s, v13.8h\n"
-                "umull2    v13.8h, v3.16b, %[b1].16b\n"
-                "uadalp    v30.4s, v14.8h\n"
-                "umull2    v14.8h, v3.16b, %[b2].16b\n"
-                "uadalp    v31.4s, v15.8h\n"
-                "umull2    v15.8h, v3.16b, %[b3].16b\n"
+                "uadalp	v28.4s, v12.8h\n"
+                "umull2	v12.8h, v3.16b, %[b0].16b\n"
+                "uadalp	v29.4s, v13.8h\n"
+                "umull2	v13.8h, v3.16b, %[b1].16b\n"
+                "uadalp	v30.4s, v14.8h\n"
+                "umull2	v14.8h, v3.16b, %[b2].16b\n"
+                "uadalp	v31.4s, v15.8h\n"
+                "umull2	v15.8h, v3.16b, %[b3].16b\n"
 
-                "uadalp    v28.4s, v12.8h\n"
-                "uadalp    v29.4s, v13.8h\n"
-                "uadalp    v30.4s, v14.8h\n"
-                "uadalp    v31.4s, v15.8h\n"
+                "uadalp	v28.4s, v12.8h\n"
+                "uadalp	v29.4s, v13.8h\n"
+                "uadalp	v30.4s, v14.8h\n"
+                "uadalp	v31.4s, v15.8h\n"
 
-                "addp    v16.4s, v16.4s, v17.4s\n"
-                "addp    v17.4s, v18.4s, v19.4s\n"
-                "addp    v18.4s, v20.4s, v21.4s\n"
-                "addp    v19.4s, v22.4s, v23.4s\n"
-                "addp    v20.4s, v24.4s, v25.4s\n"
-                "addp    v21.4s, v26.4s, v27.4s\n"
-                "addp    v22.4s, v28.4s, v29.4s\n"
-                "addp    v23.4s, v30.4s, v31.4s\n"
+                "addp	v16.4s, v16.4s, v17.4s\n"
+                "addp	v17.4s, v18.4s, v19.4s\n"
+                "addp	v18.4s, v20.4s, v21.4s\n"
+                "addp	v19.4s, v22.4s, v23.4s\n"
+                "addp	v20.4s, v24.4s, v25.4s\n"
+                "addp	v21.4s, v26.4s, v27.4s\n"
+                "addp	v22.4s, v28.4s, v29.4s\n"
+                "addp	v23.4s, v30.4s, v31.4s\n"
 
-                "addp    v16.4s, v16.4s, v17.4s\n"
-                "addp    v17.4s, v18.4s, v19.4s\n"
-                "addp    v18.4s, v20.4s, v21.4s\n"
-                "addp    v19.4s, v22.4s, v23.4s\n"
+                "addp	v16.4s, v16.4s, v17.4s\n"
+                "addp	v17.4s, v18.4s, v19.4s\n"
+                "addp	v18.4s, v20.4s, v21.4s\n"
+                "addp	v19.4s, v22.4s, v23.4s\n"
 
-                "str    q16, [%[c_ptr]]\n"
-                "str    q17, [%[c_ptr], #16]\n"
-                "str    q18, [%[c_ptr], #32]\n"
-                "str    q19, [%[c_ptr], #48]\n"
-                "add    %[c_ptr], %[c_ptr], #64\n"
+                "str	q16, [%[c_ptr]]\n"
+                "str	q17, [%[c_ptr], #16]\n"
+                "str	q18, [%[c_ptr], #32]\n"
+                "str	q19, [%[c_ptr], #48]\n"
+                "add	%[c_ptr], %[c_ptr], #64\n"
 
-                :
-                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
-                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [b3] "+w"(b3),
-                [k] "+r"(k)
-                :
-                : "x20", "x21", "v0", "v1", "v2", "v3", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
-                "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+            :
+              [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+              [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [b3] "+w" (b3),
+              [k] "+r" (k)
+            :
+            : "x20", "x21", "v0","v1","v2","v3","v12","v13","v14","v15","v16","v17","v18","v19",
+              "v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31", "cc");
         }
     }
 }
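
The inner loop restored above is the usual NEON 8-bit dot-product emulation: UMULL/UMULL2 widen each 8-bit multiply to a 16-bit product, UADALP pairwise-accumulates those products into 32-bit lanes, and the trailing ADDP cascade collapses the 32-bit accumulators into the output tile. The scalar model below is only an illustrative sketch of that arithmetic (dot_u8_block is a hypothetical name, not part of the patch), not the vectorised kernel itself.

#include <cstdint>
#include <cstddef>

// Scalar sketch of the UMULL/UADALP accumulation pattern: each 16-byte block
// of A and B contributes eight pairwise-added 16-bit products to a 32-bit
// accumulator, mirroring "umull/umull2" followed by "uadalp" in the kernel.
static uint32_t dot_u8_block(const uint8_t *a, const uint8_t *b, size_t blocks16)
{
    uint32_t acc = 0;
    for (size_t blk = 0; blk < blocks16; blk++) {
        const uint8_t *pa = a + blk * 16;
        const uint8_t *pb = b + blk * 16;
        for (int i = 0; i < 16; i += 2) {
            // UMULL/UMULL2: 8-bit x 8-bit -> 16-bit products (cannot overflow).
            uint16_t p0 = static_cast<uint16_t>(pa[i])     * pb[i];
            uint16_t p1 = static_cast<uint16_t>(pa[i + 1]) * pb[i + 1];
            // UADALP: add each adjacent pair of 16-bit products into a 32-bit lane.
            acc += static_cast<uint32_t>(p0) + static_cast<uint32_t>(p1);
        }
    }
    return acc;
}
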
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
index 5fc0a7b..fe74b99 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
@@ -27,8 +27,8 @@
 
 #include "arm_gemm.hpp"
 
-namespace arm_gemm
-{
+namespace arm_gemm {
+
 // Actual kernel implementations
 void a64_hgemm_asimd_24x8(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
 void a64_hgemm_asimd_24x8_a55r1(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
@@ -37,33 +37,30 @@
 //
 // The generic "gemm_opt" function will instantiate one of these (allowing
 // the constructor to pick a kernel implementation).
-class hgemm_24x8
-{
+class hgemm_24x8 {
 public:
     typedef __fp16 operand_type;
     typedef __fp16 result_type;
 
     typedef void (*kern_type)(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
 
-    static const int  A_block      = 1;
-    static const int  A_interleave = 8;
-    static const bool A_transpose  = false;
+    static const int A_block = 1;
+    static const int A_interleave = 8;
+    static const bool A_transpose = false;
 
-    static const int  B_block      = 1;
-    static const int  B_interleave = 24;
-    static const bool B_transpose  = true;
+    static const int B_block = 1;
+    static const int B_interleave = 24;
+    static const bool B_transpose = true;
 
-    static const int out_width  = 24;
+    static const int out_width = 24;
     static const int out_height = 8;
-    static const int k_unroll   = 1;
+    static const int k_unroll = 1;
 
     // Default to the generic kernel
     kern_type kernel = a64_hgemm_asimd_24x8;
 
-    hgemm_24x8(const CPUInfo *ci)
-    {
-        if(ci->get_cpu_model() == CPUModel::A55r1)
-        {
+    hgemm_24x8(const CPUInfo *ci) {
+        if (ci->get_cpu_model() == CPUModel::A55r1) {
             kernel = a64_hgemm_asimd_24x8_a55r1;
         }
     }
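
The hgemm_24x8 class restored above acts as a kernel "strategy" descriptor: the static constants describe the operand interleaving and the 24x8 output tile, and the constructor swaps the default ASIMD kernel for the A55r1-tuned variant when the CPU model matches. The fragment below is a minimal, self-contained sketch of that selection pattern with stand-in types (CpuInfo/CpuModel are placeholders for the real arm_gemm types, and float replaces __fp16 so it builds anywhere); it is illustrative only.

#include <cstdio>

// Self-contained sketch of the CPU-based kernel selection used by hgemm_24x8.
enum class CpuModel { GENERIC, A55r1 };
struct CpuInfo {
    CpuModel model;
    CpuModel get_cpu_model() const { return model; }
};

typedef void (*kern_type)(const float *, const float *, float *, int, int, int);

static void kernel_generic(const float *, const float *, float *, int, int, int) { std::puts("generic 24x8 kernel"); }
static void kernel_a55r1 (const float *, const float *, float *, int, int, int) { std::puts("A55r1-tuned 24x8 kernel"); }

struct hgemm_like_strategy {
    static const int out_width  = 24;   // output tile is 24 columns...
    static const int out_height = 8;    // ...by 8 rows, as in hgemm_24x8

    kern_type kernel = kernel_generic;  // default to the generic kernel

    explicit hgemm_like_strategy(const CpuInfo *ci) {
        if (ci->get_cpu_model() == CpuModel::A55r1) {
            kernel = kernel_a55r1;      // pick the in-order-tuned variant
        }
    }
};

int main() {
    CpuInfo ci{CpuModel::A55r1};
    hgemm_like_strategy strat(&ci);
    strat.kernel(nullptr, nullptr, nullptr, 0, 0, 0);   // prints "A55r1-tuned 24x8 kernel"
}
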
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp
index 2186117..a3839ce 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp
@@ -39,25 +39,22 @@
 // Note that the intent of this is that either ablocks or bblocks will be 1
 // - this construction allows the output loop to proceed in either order.
 
-namespace arm_gemm
-{
-void a64_hgemm_asimd_24x8_a55r1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K)
-{
+namespace arm_gemm {
+
+void a64_hgemm_asimd_24x8_a55r1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
     const __fp16 *a_ptr = Apanel;
-    __fp16       *c_ptr = Cpanel;
+    __fp16 *c_ptr = Cpanel;
 
     // Fix up for odd lengths - set a flag if K is odd, but make
     // sure we round up the iteration count.
-    int oddk    = (K & 1);
-    int k_iters = ((K + 1) / 2) - 1;
+    int oddk = (K & 1);
+    int k_iters = ((K+1)/2) - 1;
 
-    for(int yb = 0; yb < ablocks; yb++)
-    {
+    for (int yb=0; yb<ablocks; yb++) {
         const __fp16 *a_ptr0 = a_ptr;
-        const __fp16 *b_ptr  = Bpanel;
+        const __fp16 *b_ptr = Bpanel;
 
-        for(int xb = 0; xb < bblocks; xb++)
-        {
+        for (int xb=0; xb<bblocks; xb++) {
             int k = k_iters;
             a_ptr = a_ptr0;
 
@@ -65,294 +62,333 @@
             // "A" operands to save on "ins" instructions.  Since A55 is
             // in-order, two sets of "A" operands and one set of "B" is
             // sufficient.
-            register float16x8_t a0 asm("v0");
-            register float16x8_t a1 asm("v1");
+            register float16x8_t a0  asm("v0");
+            register float16x8_t a1  asm("v1");
             register float16x8_t a0a asm("v2");
             register float16x8_t a1a asm("v3");
-            register float16x8_t b0 asm("v4");
-            register float16x8_t b1 asm("v5");
-            register float16x8_t b2 asm("v6");
+            register float16x8_t b0  asm("v4");
+            register float16x8_t b1  asm("v5");
+            register float16x8_t b2  asm("v6");
 
-            __asm __volatile(
-                // Enable FP16 extensions
-                ".arch    armv8.2-a+fp16\n"
+            __asm __volatile (
+                // Enable FP16 instruction support (but only if it's not already on).
+#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                ".arch	armv8.2-a+fp16\n"
+#endif
                 // Initialize result registers, load initial operands, prime prefetches.
-                "movi    v8.8h, #0x0\n"
-                "ldr    %d[a0], [%[a_ptr]]\n"
-                "movi    v9.8h, #0x0\n"
-                "ldr    %q[b0], [%[b_ptr]]\n"
-                "movi    v10.8h, #0x0\n"
-                "ldr    %d[a1], [%[a_ptr], #8]\n"
-                "movi    v11.8h, #0x0\n"
-                "ldr    %q[b1], [%[b_ptr], #16]\n"
-                "movi    v12.8h, #0x0\n"
-                "movi    v13.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]")
-                "movi    v14.8h, #0x0\n"
-                "movi    v15.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]")
-                "movi    v16.8h, #0x0\n"
-                "movi    v17.8h, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]")
-                "movi    v18.8h, #0x0\n"
-                "movi    v19.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]")
-                "movi    v20.8h, #0x0\n"
-                "movi    v21.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]")
-                "movi    v22.8h, #0x0\n"
-                "movi    v23.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]")
-                "movi    v24.8h, #0x0\n"
-                "movi    v25.8h, #0x0\n"
-                "movi    v26.8h, #0x0\n"
-                "movi    v27.8h, #0x0\n"
-                "movi    v28.8h, #0x0\n"
-                "movi    v29.8h, #0x0\n"
-                "movi    v30.8h, #0x0\n"
-                "movi    v31.8h, #0x0\n"
+                "movi	v8.8h, #0x0\n"
+                "ldr	%d[a0], [%[a_ptr]]\n"
+                "movi	v9.8h, #0x0\n"
+                "ldr	%q[b0], [%[b_ptr]]\n"
+                "movi	v10.8h, #0x0\n"
+                "ldr	%d[a1], [%[a_ptr], #8]\n"
+                "movi	v11.8h, #0x0\n"
+                "ldr	%q[b1], [%[b_ptr], #16]\n"
+                "movi	v12.8h, #0x0\n"
+                "movi	v13.8h, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi	v14.8h, #0x0\n"
+                "movi	v15.8h, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi	v16.8h, #0x0\n"
+                "movi	v17.8h, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi	v18.8h, #0x0\n"
+                "movi	v19.8h, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi	v20.8h, #0x0\n"
+                "movi	v21.8h, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi	v22.8h, #0x0\n"
+                "movi	v23.8h, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi	v24.8h, #0x0\n"
+                "movi	v25.8h, #0x0\n"
+                "movi	v26.8h, #0x0\n"
+                "movi	v27.8h, #0x0\n"
+                "movi	v28.8h, #0x0\n"
+                "movi	v29.8h, #0x0\n"
+                "movi	v30.8h, #0x0\n"
+                "movi	v31.8h, #0x0\n"
 
                 // The loop is offset by these two instructions which must
                 // always be executed.
-                "fmla     v8.8h , %[b0].8h, %[a0].h[0]\n"
-                "ldr    %d[b2], [%[b_ptr], #32]\n"
+                "fmla 	v8.8h , %[b0].8h, %[a0].h[0]\n"
+                "ldr	%d[b2], [%[b_ptr], #32]\n"
 
                 // Skip loop if we are doing zero iterations of it.
-                "cbz    %w[k], 4f\n"
+                "cbz	%w[k], 4f\n"
 
                 "1:\n"
-                "fmla      v9.8h , %[b0].8h, %[a0].h[1]\n"
-                "ldr    x20, [%[b_ptr], #40]\n"
-                "fmla    v10.8h, %[b0].8h, %[a0].h[2]\n"
-                "subs    %w[k], %w[k], #1\n"
-                "fmla    v11.8h, %[b0].8h, %[a0].h[3]\n"
-                "ldr    %d[a0a], [%[a_ptr], #16]\n"
+                "fmla  	v9.8h , %[b0].8h, %[a0].h[1]\n"
+                "ldr	x20, [%[b_ptr], #40]\n"
+                "fmla	v10.8h, %[b0].8h, %[a0].h[2]\n"
+                "subs	%w[k], %w[k], #1\n"
+                "fmla	v11.8h, %[b0].8h, %[a0].h[3]\n"
+                "ldr	%d[a0a], [%[a_ptr], #16]\n"
 
-                "fmla     v12.8h, %[b0].8h, %[a1].h[0]\n"
-                "ins    %[b2].d[1], x20\n"
-                "fmla    v13.8h, %[b0].8h, %[a1].h[1]\n"
-                "fmla    v14.8h, %[b0].8h, %[a1].h[2]\n"
-                "fmla    v15.8h, %[b0].8h, %[a1].h[3]\n"
-                "ldr    %d[a1a], [%[a_ptr], #24]\n"
+                "fmla 	v12.8h, %[b0].8h, %[a1].h[0]\n"
+                "ins	%[b2].d[1], x20\n"
+                "fmla	v13.8h, %[b0].8h, %[a1].h[1]\n"
+                "fmla	v14.8h, %[b0].8h, %[a1].h[2]\n"
+                "fmla	v15.8h, %[b0].8h, %[a1].h[3]\n"
+                "ldr	%d[a1a], [%[a_ptr], #24]\n"
 
-                "fmla    v16.8h, %[b1].8h, %[a0].h[0]\n"
-                "fmla    v17.8h, %[b1].8h, %[a0].h[1]\n"
-                "fmla    v18.8h, %[b1].8h, %[a0].h[2]\n"
-                "fmla    v19.8h, %[b1].8h, %[a0].h[3]\n"
-                "ldr    %d[b0], [%[b_ptr], #48]\n"
+                "fmla	v16.8h, %[b1].8h, %[a0].h[0]\n"
+                "fmla	v17.8h, %[b1].8h, %[a0].h[1]\n"
+                "fmla	v18.8h, %[b1].8h, %[a0].h[2]\n"
+                "fmla	v19.8h, %[b1].8h, %[a0].h[3]\n"
+                "ldr	%d[b0], [%[b_ptr], #48]\n"
 
-                "fmla    v20.8h, %[b1].8h, %[a1].h[0]\n"
-                "fmla    v21.8h, %[b1].8h, %[a1].h[1]\n"
-                "ldr    x20, [%[b_ptr], #56]\n"
-                "fmla    v22.8h, %[b1].8h, %[a1].h[2]\n"
-                "fmla    v23.8h, %[b1].8h, %[a1].h[3]\n"
-                "ldr    %d[b1], [%[b_ptr], #64]\n"
+                "fmla	v20.8h, %[b1].8h, %[a1].h[0]\n"
+                "fmla	v21.8h, %[b1].8h, %[a1].h[1]\n"
+                "ldr	x20, [%[b_ptr], #56]\n"
+                "fmla	v22.8h, %[b1].8h, %[a1].h[2]\n"
+                "fmla	v23.8h, %[b1].8h, %[a1].h[3]\n"
+                "ldr	%d[b1], [%[b_ptr], #64]\n"
 
-                "fmla    v24.8h, %[b2].8h, %[a0].h[0]\n"
-                "ins    %[b0].d[1], x20\n"
-                "fmla    v25.8h, %[b2].8h, %[a0].h[1]\n"
-                "ldr    x20, [%[b_ptr], #72]\n"
-                "fmla    v26.8h, %[b2].8h, %[a0].h[2]\n"
-                "fmla    v27.8h, %[b2].8h, %[a0].h[3]\n" ASM_PREFETCH("[%[a_ptr], #128]")
+                "fmla	v24.8h, %[b2].8h, %[a0].h[0]\n"
+                "ins	%[b0].d[1], x20\n"
+                "fmla	v25.8h, %[b2].8h, %[a0].h[1]\n"
+                "ldr	x20, [%[b_ptr], #72]\n"
+                "fmla	v26.8h, %[b2].8h, %[a0].h[2]\n"
+                "fmla	v27.8h, %[b2].8h, %[a0].h[3]\n"
+                ASM_PREFETCH("[%[a_ptr], #128]")
 
-                "fmla    v28.8h, %[b2].8h, %[a1].h[0]\n"
-                "fmla    v29.8h, %[b2].8h, %[a1].h[1]\n" ASM_PREFETCH("[%[b_ptr], #384]")
-                "fmla    v30.8h, %[b2].8h, %[a1].h[2]\n"
-                "fmla    v31.8h, %[b2].8h, %[a1].h[3]\n"
-                "ldr    %d[b2], [%[b_ptr], #80]\n"
+                "fmla	v28.8h, %[b2].8h, %[a1].h[0]\n"
+                "fmla	v29.8h, %[b2].8h, %[a1].h[1]\n"
+                ASM_PREFETCH("[%[b_ptr], #384]")
+                "fmla	v30.8h, %[b2].8h, %[a1].h[2]\n"
+                "fmla	v31.8h, %[b2].8h, %[a1].h[3]\n"
+                "ldr	%d[b2], [%[b_ptr], #80]\n"
 
                 // Unroll 1
-                "fmla     v8.8h , %[b0].8h, %[a0a].h[0]\n"
-                "ins    %[b1].d[1], x20\n"
-                "fmla    v9.8h , %[b0].8h, %[a0a].h[1]\n"
-                "ldr    x20, [%[b_ptr], #88]\n"
-                "fmla    v10.8h, %[b0].8h, %[a0a].h[2]\n"
-                "fmla    v11.8h, %[b0].8h, %[a0a].h[3]\n"
-                "ldr    %d[a0], [%[a_ptr], #32]\n"
+                "fmla 	v8.8h , %[b0].8h, %[a0a].h[0]\n"
+                "ins	%[b1].d[1], x20\n"
+                "fmla	v9.8h , %[b0].8h, %[a0a].h[1]\n"
+                "ldr	x20, [%[b_ptr], #88]\n"
+                "fmla	v10.8h, %[b0].8h, %[a0a].h[2]\n"
+                "fmla	v11.8h, %[b0].8h, %[a0a].h[3]\n"
+                "ldr	%d[a0], [%[a_ptr], #32]\n"
 
-                "fmla     v12.8h, %[b0].8h, %[a1a].h[0]\n"
-                "ins    %[b2].d[1], x20\n"
-                "fmla    v13.8h, %[b0].8h, %[a1a].h[1]\n"
-                "fmla    v14.8h, %[b0].8h, %[a1a].h[2]\n"
-                "fmla    v15.8h, %[b0].8h, %[a1a].h[3]\n"
-                "ldr    %d[a1], [%[a_ptr], #40]\n"
+                "fmla 	v12.8h, %[b0].8h, %[a1a].h[0]\n"
+                "ins	%[b2].d[1], x20\n"
+                "fmla	v13.8h, %[b0].8h, %[a1a].h[1]\n"
+                "fmla	v14.8h, %[b0].8h, %[a1a].h[2]\n"
+                "fmla	v15.8h, %[b0].8h, %[a1a].h[3]\n"
+                "ldr	%d[a1], [%[a_ptr], #40]\n"
 
-                "fmla    v16.8h, %[b1].8h, %[a0a].h[0]\n"
-                "add    %[a_ptr], %[a_ptr], #32\n"
-                "fmla    v17.8h, %[b1].8h, %[a0a].h[1]\n"
-                "fmla    v18.8h, %[b1].8h, %[a0a].h[2]\n"
-                "fmla    v19.8h, %[b1].8h, %[a0a].h[3]\n"
-                "ldr    %d[b0], [%[b_ptr], #96]\n"
+                "fmla	v16.8h, %[b1].8h, %[a0a].h[0]\n"
+                "add	%[a_ptr], %[a_ptr], #32\n"
+                "fmla	v17.8h, %[b1].8h, %[a0a].h[1]\n"
+                "fmla	v18.8h, %[b1].8h, %[a0a].h[2]\n"
+                "fmla	v19.8h, %[b1].8h, %[a0a].h[3]\n"
+                "ldr	%d[b0], [%[b_ptr], #96]\n"
 
-                "fmla    v20.8h, %[b1].8h, %[a1a].h[0]\n"
-                "fmla    v21.8h, %[b1].8h, %[a1a].h[1]\n"
-                "ldr    x20, [%[b_ptr], #104]\n"
-                "fmla    v22.8h, %[b1].8h, %[a1a].h[2]\n"
-                "fmla    v23.8h, %[b1].8h, %[a1a].h[3]\n"
-                "ldr    %d[b1], [%[b_ptr], #112]\n"
+                "fmla	v20.8h, %[b1].8h, %[a1a].h[0]\n"
+                "fmla	v21.8h, %[b1].8h, %[a1a].h[1]\n"
+                "ldr	x20, [%[b_ptr], #104]\n"
+                "fmla	v22.8h, %[b1].8h, %[a1a].h[2]\n"
+                "fmla	v23.8h, %[b1].8h, %[a1a].h[3]\n"
+                "ldr	%d[b1], [%[b_ptr], #112]\n"
 
-                "fmla    v24.8h, %[b2].8h, %[a0a].h[0]\n"
-                "ins    %[b0].d[1], x20\n"
-                "fmla    v25.8h, %[b2].8h, %[a0a].h[1]\n"
-                "ldr    x20, [%[b_ptr], #120]\n"
-                "fmla    v26.8h, %[b2].8h, %[a0a].h[2]\n"
-                "fmla    v27.8h, %[b2].8h, %[a0a].h[3]\n"
+                "fmla	v24.8h, %[b2].8h, %[a0a].h[0]\n"
+                "ins	%[b0].d[1], x20\n"
+                "fmla	v25.8h, %[b2].8h, %[a0a].h[1]\n"
+                "ldr	x20, [%[b_ptr], #120]\n"
+                "fmla	v26.8h, %[b2].8h, %[a0a].h[2]\n"
+                "fmla	v27.8h, %[b2].8h, %[a0a].h[3]\n"
 
-                "fmla    v28.8h, %[b2].8h, %[a1a].h[0]\n" ASM_PREFETCH("[%[b_ptr], #448]")
-                "fmla    v29.8h, %[b2].8h, %[a1a].h[1]\n"
-                "add    %[b_ptr], %[b_ptr], #96\n"
-                "fmla    v30.8h, %[b2].8h, %[a1a].h[2]\n"
-                "ins    %[b1].d[1], x20\n"
-                "fmla    v31.8h, %[b2].8h, %[a1a].h[3]\n"
-                "ldr    %d[b2], [%[b_ptr], #32]\n"
+                "fmla	v28.8h, %[b2].8h, %[a1a].h[0]\n"
+                ASM_PREFETCH("[%[b_ptr], #448]")
+                "fmla	v29.8h, %[b2].8h, %[a1a].h[1]\n"
+                "add	%[b_ptr], %[b_ptr], #96\n"
+                "fmla	v30.8h, %[b2].8h, %[a1a].h[2]\n"
+                "ins	%[b1].d[1], x20\n"
+                "fmla	v31.8h, %[b2].8h, %[a1a].h[3]\n"
+                "ldr	%d[b2], [%[b_ptr], #32]\n"
 
-                "fmla     v8.8h , %[b0].8h, %[a0].h[0]\n"
-                "bne    1b\n"
+                "fmla 	v8.8h , %[b0].8h, %[a0].h[0]\n"
+                "bne	1b\n"
 
                 "4:\n"
 
                 // Start final iteration - branch off to "odd" code before we load a0a
-                "fmla      v9.8h , %[b0].8h, %[a0].h[1]\n"
-                "ldr    x20, [%[b_ptr], #40]\n"
-                "fmla    v10.8h, %[b0].8h, %[a0].h[2]\n"
-                "cbnz    %w[oddk], 2f\n"
+                "fmla  	v9.8h , %[b0].8h, %[a0].h[1]\n"
+                "ldr	x20, [%[b_ptr], #40]\n"
+                "fmla	v10.8h, %[b0].8h, %[a0].h[2]\n"
+                "cbnz	%w[oddk], 2f\n"
 
                 // Even K continuation
-                "fmla    v11.8h, %[b0].8h, %[a0].h[3]\n"
-                "ldr    %d[a0a], [%[a_ptr], #16]\n"
+                "fmla	v11.8h, %[b0].8h, %[a0].h[3]\n"
+                "ldr	%d[a0a], [%[a_ptr], #16]\n"
 
-                "fmla     v12.8h, %[b0].8h, %[a1].h[0]\n"
-                "ins    %[b2].d[1], x20\n"
-                "fmla    v13.8h, %[b0].8h, %[a1].h[1]\n" ASM_PREFETCHW("[%[c_ptr]]")
-                "fmla    v14.8h, %[b0].8h, %[a1].h[2]\n"
-                "fmla    v15.8h, %[b0].8h, %[a1].h[3]\n"
-                "ldr    %d[a1a], [%[a_ptr], #24]\n"
+                "fmla 	v12.8h, %[b0].8h, %[a1].h[0]\n"
+                "ins	%[b2].d[1], x20\n"
+                "fmla	v13.8h, %[b0].8h, %[a1].h[1]\n"
+                ASM_PREFETCHW("[%[c_ptr]]")
+                "fmla	v14.8h, %[b0].8h, %[a1].h[2]\n"
+                "fmla	v15.8h, %[b0].8h, %[a1].h[3]\n"
+                "ldr	%d[a1a], [%[a_ptr], #24]\n"
 
-                "fmla    v16.8h, %[b1].8h, %[a0].h[0]\n"
-                "fmla    v17.8h, %[b1].8h, %[a0].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
-                "fmla    v18.8h, %[b1].8h, %[a0].h[2]\n"
-                "fmla    v19.8h, %[b1].8h, %[a0].h[3]\n"
-                "ldr    %d[b0], [%[b_ptr], #48]\n"
+                "fmla	v16.8h, %[b1].8h, %[a0].h[0]\n"
+                "fmla	v17.8h, %[b1].8h, %[a0].h[1]\n"
+                ASM_PREFETCHW("[%[c_ptr], #64]")
+                "fmla	v18.8h, %[b1].8h, %[a0].h[2]\n"
+                "fmla	v19.8h, %[b1].8h, %[a0].h[3]\n"
+                "ldr	%d[b0], [%[b_ptr], #48]\n"
 
-                "fmla    v20.8h, %[b1].8h, %[a1].h[0]\n"
-                "fmla    v21.8h, %[b1].8h, %[a1].h[1]\n"
-                "ldr    x20, [%[b_ptr], #56]\n"
-                "fmla    v22.8h, %[b1].8h, %[a1].h[2]\n"
-                "fmla    v23.8h, %[b1].8h, %[a1].h[3]\n"
-                "ldr    %d[b1], [%[b_ptr], #64]\n"
+                "fmla	v20.8h, %[b1].8h, %[a1].h[0]\n"
+                "fmla	v21.8h, %[b1].8h, %[a1].h[1]\n"
+                "ldr	x20, [%[b_ptr], #56]\n"
+                "fmla	v22.8h, %[b1].8h, %[a1].h[2]\n"
+                "fmla	v23.8h, %[b1].8h, %[a1].h[3]\n"
+                "ldr	%d[b1], [%[b_ptr], #64]\n"
 
-                "fmla    v24.8h, %[b2].8h, %[a0].h[0]\n"
-                "ins    %[b0].d[1], x20\n"
-                "fmla    v25.8h, %[b2].8h, %[a0].h[1]\n"
-                "ldr    x20, [%[b_ptr], #72]\n"
-                "fmla    v26.8h, %[b2].8h, %[a0].h[2]\n"
-                "fmla    v27.8h, %[b2].8h, %[a0].h[3]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+                "fmla	v24.8h, %[b2].8h, %[a0].h[0]\n"
+                "ins	%[b0].d[1], x20\n"
+                "fmla	v25.8h, %[b2].8h, %[a0].h[1]\n"
+                "ldr	x20, [%[b_ptr], #72]\n"
+                "fmla	v26.8h, %[b2].8h, %[a0].h[2]\n"
+                "fmla	v27.8h, %[b2].8h, %[a0].h[3]\n"
+                ASM_PREFETCHW("[%[c_ptr], #128]")
 
-                "fmla    v28.8h, %[b2].8h, %[a1].h[0]\n"
-                "fmla    v29.8h, %[b2].8h, %[a1].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
-                "fmla    v30.8h, %[b2].8h, %[a1].h[2]\n"
-                "fmla    v31.8h, %[b2].8h, %[a1].h[3]\n"
-                "ldr    %d[b2], [%[b_ptr], #80]\n"
+                "fmla	v28.8h, %[b2].8h, %[a1].h[0]\n"
+                "fmla	v29.8h, %[b2].8h, %[a1].h[1]\n"
+                ASM_PREFETCHW("[%[c_ptr], #192]")
+                "fmla	v30.8h, %[b2].8h, %[a1].h[2]\n"
+                "fmla	v31.8h, %[b2].8h, %[a1].h[3]\n"
+                "ldr	%d[b2], [%[b_ptr], #80]\n"
 
-                "fmla     v8.8h , %[b0].8h, %[a0a].h[0]\n"
-                "ins    %[b1].d[1], x20\n"
-                "fmla    v9.8h , %[b0].8h, %[a0a].h[1]\n"
-                "ldr    x20, [%[b_ptr], #88]\n"
-                "fmla    v10.8h, %[b0].8h, %[a0a].h[2]\n"
-                "fmla    v11.8h, %[b0].8h, %[a0a].h[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+                "fmla 	v8.8h , %[b0].8h, %[a0a].h[0]\n"
+                "ins	%[b1].d[1], x20\n"
+                "fmla	v9.8h , %[b0].8h, %[a0a].h[1]\n"
+                "ldr	x20, [%[b_ptr], #88]\n"
+                "fmla	v10.8h, %[b0].8h, %[a0a].h[2]\n"
+                "fmla	v11.8h, %[b0].8h, %[a0a].h[3]\n"
+                ASM_PREFETCHW("[%[c_ptr], #256]")
 
-                "fmla     v12.8h, %[b0].8h, %[a1a].h[0]\n"
-                "ins    %[b2].d[1], x20\n"
-                "fmla    v13.8h, %[b0].8h, %[a1a].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
-                "fmla    v14.8h, %[b0].8h, %[a1a].h[2]\n"
-                "fmla    v15.8h, %[b0].8h, %[a1a].h[3]\n"
-                "ldr    %d[a1], [%[a_ptr], #40]\n"
+                "fmla 	v12.8h, %[b0].8h, %[a1a].h[0]\n"
+                "ins	%[b2].d[1], x20\n"
+                "fmla	v13.8h, %[b0].8h, %[a1a].h[1]\n"
+                ASM_PREFETCHW("[%[c_ptr], #320]")
+                "fmla	v14.8h, %[b0].8h, %[a1a].h[2]\n"
+                "fmla	v15.8h, %[b0].8h, %[a1a].h[3]\n"
+                "ldr	%d[a1], [%[a_ptr], #40]\n"
 
-                "fmla    v16.8h, %[b1].8h, %[a0a].h[0]\n"
-                "add    %[a_ptr], %[a_ptr], #32\n"
-                "fmla    v17.8h, %[b1].8h, %[a0a].h[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
-                "fmla    v18.8h, %[b1].8h, %[a0a].h[2]\n"
-                "fmla    v19.8h, %[b1].8h, %[a0a].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+                "fmla	v16.8h, %[b1].8h, %[a0a].h[0]\n"
+                "add	%[a_ptr], %[a_ptr], #32\n"
+                "fmla	v17.8h, %[b1].8h, %[a0a].h[1]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #384]")
+                "fmla	v18.8h, %[b1].8h, %[a0a].h[2]\n"
+                "fmla	v19.8h, %[b1].8h, %[a0a].h[3]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #448]")
 
-                "fmla    v20.8h, %[b1].8h, %[a1a].h[0]\n"
-                "fmla    v21.8h, %[b1].8h, %[a1a].h[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]")
-                "fmla    v22.8h, %[b1].8h, %[a1a].h[2]\n"
-                "fmla    v23.8h, %[b1].8h, %[a1a].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]")
+                "fmla	v20.8h, %[b1].8h, %[a1a].h[0]\n"
+                "fmla	v21.8h, %[b1].8h, %[a1a].h[1]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #512]")
+                "fmla	v22.8h, %[b1].8h, %[a1a].h[2]\n"
+                "fmla	v23.8h, %[b1].8h, %[a1a].h[3]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #576]")
 
-                "fmla    v24.8h, %[b2].8h, %[a0a].h[0]\n"
-                "fmla    v25.8h, %[b2].8h, %[a0a].h[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #640]")
-                "fmla    v26.8h, %[b2].8h, %[a0a].h[2]\n"
-                "fmla    v27.8h, %[b2].8h, %[a0a].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+                "fmla	v24.8h, %[b2].8h, %[a0a].h[0]\n"
+                "fmla	v25.8h, %[b2].8h, %[a0a].h[1]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #640]")
+                "fmla	v26.8h, %[b2].8h, %[a0a].h[2]\n"
+                "fmla	v27.8h, %[b2].8h, %[a0a].h[3]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #704]")
 
-                "fmla    v28.8h, %[b2].8h, %[a1a].h[0]\n"
-                "fmla    v29.8h, %[b2].8h, %[a1a].h[1]\n"
-                "add    %[b_ptr], %[b_ptr], #96\n"
-                "fmla    v30.8h, %[b2].8h, %[a1a].h[2]\n"
-                "fmla    v31.8h, %[b2].8h, %[a1a].h[3]\n"
-                "b    3f\n"
+                "fmla	v28.8h, %[b2].8h, %[a1a].h[0]\n"
+                "fmla	v29.8h, %[b2].8h, %[a1a].h[1]\n"
+                "add	%[b_ptr], %[b_ptr], #96\n"
+                "fmla	v30.8h, %[b2].8h, %[a1a].h[2]\n"
+                "fmla	v31.8h, %[b2].8h, %[a1a].h[3]\n"
+                "b	3f\n"
 
                 "2:\n"
 
                 // Odd tail
-                "fmla    v11.8h, %[b0].8h, %[a0].h[3]\n" ASM_PREFETCHW("[%[c_ptr]]")
+                "fmla	v11.8h, %[b0].8h, %[a0].h[3]\n"
+                ASM_PREFETCHW("[%[c_ptr]]")
 
-                "fmla     v12.8h, %[b0].8h, %[a1].h[0]\n"
-                "ins    %[b2].d[1], x20\n"
-                "fmla    v13.8h, %[b0].8h, %[a1].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
-                "fmla    v14.8h, %[b0].8h, %[a1].h[2]\n"
-                "add    %[a_ptr], %[a_ptr], #16\n"
-                "fmla    v15.8h, %[b0].8h, %[a1].h[3]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+                "fmla 	v12.8h, %[b0].8h, %[a1].h[0]\n"
+                "ins	%[b2].d[1], x20\n"
+                "fmla	v13.8h, %[b0].8h, %[a1].h[1]\n"
+                ASM_PREFETCHW("[%[c_ptr], #64]")
+                "fmla	v14.8h, %[b0].8h, %[a1].h[2]\n"
+                "add	%[a_ptr], %[a_ptr], #16\n"
+                "fmla	v15.8h, %[b0].8h, %[a1].h[3]\n"
+                ASM_PREFETCHW("[%[c_ptr], #128]")
 
-                "fmla    v16.8h, %[b1].8h, %[a0].h[0]\n"
-                "add    %[b_ptr], %[b_ptr], #48\n"
-                "fmla    v17.8h, %[b1].8h, %[a0].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
-                "fmla    v18.8h, %[b1].8h, %[a0].h[2]\n"
-                "fmla    v19.8h, %[b1].8h, %[a0].h[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+                "fmla	v16.8h, %[b1].8h, %[a0].h[0]\n"
+                "add	%[b_ptr], %[b_ptr], #48\n"
+                "fmla	v17.8h, %[b1].8h, %[a0].h[1]\n"
+                ASM_PREFETCHW("[%[c_ptr], #192]")
+                "fmla	v18.8h, %[b1].8h, %[a0].h[2]\n"
+                "fmla	v19.8h, %[b1].8h, %[a0].h[3]\n"
+                ASM_PREFETCHW("[%[c_ptr], #256]")
 
-                "fmla    v20.8h, %[b1].8h, %[a1].h[0]\n"
-                "fmla    v21.8h, %[b1].8h, %[a1].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
-                "fmla    v22.8h, %[b1].8h, %[a1].h[2]\n"
-                "fmla    v23.8h, %[b1].8h, %[a1].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+                "fmla	v20.8h, %[b1].8h, %[a1].h[0]\n"
+                "fmla	v21.8h, %[b1].8h, %[a1].h[1]\n"
+                ASM_PREFETCHW("[%[c_ptr], #320]")
+                "fmla	v22.8h, %[b1].8h, %[a1].h[2]\n"
+                "fmla	v23.8h, %[b1].8h, %[a1].h[3]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #384]")
 
-                "fmla    v24.8h, %[b2].8h, %[a0].h[0]\n"
-                "fmla    v25.8h, %[b2].8h, %[a0].h[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
-                "fmla    v26.8h, %[b2].8h, %[a0].h[2]\n"
-                "fmla    v27.8h, %[b2].8h, %[a0].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+                "fmla	v24.8h, %[b2].8h, %[a0].h[0]\n"
+                "fmla	v25.8h, %[b2].8h, %[a0].h[1]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #384]")
+                "fmla	v26.8h, %[b2].8h, %[a0].h[2]\n"
+                "fmla	v27.8h, %[b2].8h, %[a0].h[3]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #448]")
 
-                "fmla    v28.8h, %[b2].8h, %[a1].h[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]") "fmla    v29.8h, %[b2].8h, %[a1].h[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]") "fmla    v30.8h, %[b2].8h, %[a1].h[2]\n"
-                ASM_PREFETCHWL2("[%[c_ptr], #640]") "fmla    v31.8h, %[b2].8h, %[a1].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+                "fmla	v28.8h, %[b2].8h, %[a1].h[0]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #512]")
+                "fmla	v29.8h, %[b2].8h, %[a1].h[1]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #576]")
+                "fmla	v30.8h, %[b2].8h, %[a1].h[2]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #640]")
+                "fmla	v31.8h, %[b2].8h, %[a1].h[3]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #704]")
 
                 // Common tail
                 // A55 won't dual issue these stores with anything else, so
                 // simplest to do them all in this common code.
                 "3:\n"
-                "str    q8,  [%[c_ptr]]\n"
-                "str    q16, [%[c_ptr], #16]\n"
-                "str    q24, [%[c_ptr], #32]\n"
-                "str    q9,  [%[c_ptr], #48]\n"
-                "str    q17, [%[c_ptr], #64]\n"
-                "str    q25, [%[c_ptr], #80]\n"
-                "str    q10, [%[c_ptr], #96]\n"
-                "str    q18, [%[c_ptr], #112]\n"
-                "str    q26, [%[c_ptr], #128]\n"
-                "str    q11, [%[c_ptr], #144]\n"
-                "str    q19, [%[c_ptr], #160]\n"
-                "str    q27, [%[c_ptr], #176]\n"
-                "str    q12, [%[c_ptr], #192]\n"
-                "str    q20, [%[c_ptr], #208]\n"
-                "str    q28, [%[c_ptr], #224]\n"
-                "str    q13, [%[c_ptr], #240]\n"
-                "str    q21, [%[c_ptr], #256]\n"
-                "str    q29, [%[c_ptr], #272]\n"
-                "str    q14, [%[c_ptr], #288]\n"
-                "str    q22, [%[c_ptr], #304]\n"
-                "str    q30, [%[c_ptr], #320]\n"
-                "str    q15, [%[c_ptr], #336]\n"
-                "str    q23, [%[c_ptr], #352]\n"
-                "str    q31, [%[c_ptr], #368]\n"
+                "str	q8,  [%[c_ptr]]\n"
+                "str	q16, [%[c_ptr], #16]\n"
+                "str	q24, [%[c_ptr], #32]\n"
+                "str	q9,  [%[c_ptr], #48]\n"
+                "str	q17, [%[c_ptr], #64]\n"
+                "str	q25, [%[c_ptr], #80]\n"
+                "str	q10, [%[c_ptr], #96]\n"
+                "str	q18, [%[c_ptr], #112]\n"
+                "str	q26, [%[c_ptr], #128]\n"
+                "str	q11, [%[c_ptr], #144]\n"
+                "str	q19, [%[c_ptr], #160]\n"
+                "str	q27, [%[c_ptr], #176]\n"
+                "str	q12, [%[c_ptr], #192]\n"
+                "str	q20, [%[c_ptr], #208]\n"
+                "str	q28, [%[c_ptr], #224]\n"
+                "str	q13, [%[c_ptr], #240]\n"
+                "str	q21, [%[c_ptr], #256]\n"
+                "str	q29, [%[c_ptr], #272]\n"
+                "str	q14, [%[c_ptr], #288]\n"
+                "str	q22, [%[c_ptr], #304]\n"
+                "str	q30, [%[c_ptr], #320]\n"
+                "str	q15, [%[c_ptr], #336]\n"
+                "str	q23, [%[c_ptr], #352]\n"
+                "str	q31, [%[c_ptr], #368]\n"
                 "5:\n"
-                "add    %[c_ptr], %[c_ptr], #384\n"
-                :
-                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
-                [a0] "=w"(a0), [a0a] "=w"(a0a), [a1] "=w"(a1), [a1a] "=w"(a1a),
-                [b0] "=w"(b0), [b1] "=w"(b1), [b2] "=w"(b2), [k] "+r"(k)
-                : [oddk] "r"(oddk)
-                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
-                "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory");
+                "add	%[c_ptr], %[c_ptr], #384\n"
+            :
+              [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+              [a0] "=w" (a0), [a0a] "=w" (a0a), [a1] "=w" (a1), [a1a] "=w" (a1a),
+              [b0] "=w" (b0), [b1] "=w" (b1), [b2] "=w" (b2), [k] "+r" (k)
+            : [oddk] "r" (oddk)
+            : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+              "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+            );
         }
     }
 }
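
The A55r1 kernel restored above peels the last double-K iteration out of the main loop and keeps a separate "odd" tail, so the assembly can keep two K steps in flight per loop pass while still handling any K. The C++ sketch below only illustrates that loop shape (the real control flow is the cbz/cbnz and numbered labels in the assembly above); it assumes K >= 1 as the kernel does.

#include <cassert>

// Sketch of the K-loop structure: each main-loop pass covers two K steps,
// the final double step is peeled off, and an odd tail copes with odd K.
static void k_loop_shape(int K)
{
    assert(K > 0);                     // the kernel is never called with K == 0
    int oddk    = (K & 1);             // set if a single trailing K step remains
    int k_iters = ((K + 1) / 2) - 1;   // number of full double-steps in the loop

    for (int i = 0; i < k_iters; i++) {
        // "1:" main loop - unroll 0 and unroll 1 (two K steps of FMLAs),
        // with operand loads and prefetches interleaved for dual issue.
    }
    if (oddk) {
        // "2:" odd tail - one remaining K step, then fall into the store tail.
    } else {
        // even continuation - the peeled double step, then the store tail.
    }
    // "3:" common tail - store the 24x8 half-precision result tile.
}

For example, with K = 3 the sketch gives k_iters = 1 (one double step in the loop) and oddk = 1 (one step in the odd tail), covering all three K steps.
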
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
index 65a5d43..418a375 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
@@ -39,297 +39,311 @@
 // Note that the intent of this is that either ablocks or bblocks will be 1
 // - this construction allows the output loop to proceed in either order.
 
-namespace arm_gemm
-{
-void a64_hgemm_asimd_24x8(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K)
-{
+namespace arm_gemm {
+
+void a64_hgemm_asimd_24x8(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
     const __fp16 *a_ptr = Apanel;
-    __fp16       *c_ptr = Cpanel;
+    __fp16 *c_ptr = Cpanel;
 
-    for(int yb = 0; yb < ablocks; yb++)
-    {
+    for (int yb=0; yb<ablocks; yb++) {
         const __fp16 *a_ptr0 = a_ptr;
-        const __fp16 *b_ptr  = Bpanel;
+        const __fp16 *b_ptr = Bpanel;
 
-        for(int xb = 0; xb < bblocks; xb++)
-        {
+        for (int xb=0; xb<bblocks; xb++) {
             a_ptr = a_ptr0;
             // Fix up for odd lengths - set a flag if K is odd, but make
             // sure we round up the iteration count.
             int oddk = (K & 1);
-            int k    = ((K + 1) / 2) - 1;
+            int k = ((K+1)/2) - 1;
 
-            register float16x8_t a0 asm("v0");
+            register float16x8_t a0  asm("v0");
             register float16x8_t a0a asm("v1");
-            register float16x8_t b0 asm("v2");
-            register float16x8_t b1 asm("v3");
-            register float16x8_t b2 asm("v4");
+            register float16x8_t b0  asm("v2");
+            register float16x8_t b1  asm("v3");
+            register float16x8_t b2  asm("v4");
             register float16x8_t b0a asm("v5");
             register float16x8_t b1a asm("v6");
             register float16x8_t b2a asm("v7");
 
-            __asm __volatile(
-                ".arch    armv8.2-a+fp16\n"
+            __asm __volatile (
+                // Enable FP16 instruction support (but only if it's not already on).
+#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                ".arch	armv8.2-a+fp16\n"
+#endif
                 // Initialize result registers, load initial operands, prime prefetches.
-                "movi    v8.8h, #0x0\n"
-                "ldr    %q[a0], [%[a_ptr]]\n"
-                "movi    v9.8h, #0x0\n"
-                "ldr    %q[b0], [%[b_ptr]]\n"
-                "movi    v10.8h, #0x0\n"
-                "ldr    %q[b1], [%[b_ptr], #16]\n"
-                "movi    v11.8h, #0x0\n"
-                "ldr    %q[b2], [%[b_ptr], #32]\n"
-                "movi    v12.8h, #0x0\n"
-                "ldr    %q[b0a], [%[b_ptr], #48]\n"
-                "movi    v13.8h, #0x0\n"
-                "ldr    %q[b1a], [%[b_ptr], #64]\n"
-                "movi    v14.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi    v15.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi    v16.8h, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi    v17.8h, #0x0\n"
-                ASM_PREFETCH("[%[b_ptr], #192]") "movi    v18.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") "movi    v19.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]")
-                "movi    v20.8h, #0x0\n"
-                "movi    v21.8h, #0x0\n"
-                "movi    v22.8h, #0x0\n"
-                "movi    v23.8h, #0x0\n"
-                "movi    v24.8h, #0x0\n"
-                "movi    v25.8h, #0x0\n"
-                "movi    v26.8h, #0x0\n"
-                "movi    v27.8h, #0x0\n"
-                "movi    v28.8h, #0x0\n"
-                "movi    v29.8h, #0x0\n"
-                "movi    v30.8h, #0x0\n"
-                "movi    v31.8h, #0x0\n"
+                "movi	v8.8h, #0x0\n"
+                "ldr	%q[a0], [%[a_ptr]]\n"
+                "movi	v9.8h, #0x0\n"
+                "ldr	%q[b0], [%[b_ptr]]\n"
+                "movi	v10.8h, #0x0\n"
+                "ldr	%q[b1], [%[b_ptr], #16]\n"
+                "movi	v11.8h, #0x0\n"
+                "ldr	%q[b2], [%[b_ptr], #32]\n"
+                "movi	v12.8h, #0x0\n"
+                "ldr	%q[b0a], [%[b_ptr], #48]\n"
+                "movi	v13.8h, #0x0\n"
+                "ldr	%q[b1a], [%[b_ptr], #64]\n"
+                "movi	v14.8h, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi	v15.8h, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi	v16.8h, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi	v17.8h, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi	v18.8h, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi	v19.8h, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi	v20.8h, #0x0\n"
+                "movi	v21.8h, #0x0\n"
+                "movi	v22.8h, #0x0\n"
+                "movi	v23.8h, #0x0\n"
+                "movi	v24.8h, #0x0\n"
+                "movi	v25.8h, #0x0\n"
+                "movi	v26.8h, #0x0\n"
+                "movi	v27.8h, #0x0\n"
+                "movi	v28.8h, #0x0\n"
+                "movi	v29.8h, #0x0\n"
+                "movi	v30.8h, #0x0\n"
+                "movi	v31.8h, #0x0\n"
 
                 // Skip loop if we are doing zero iterations of it.
-                "cbz    %w[k], 4f\n"
+                "cbz	%w[k], 4f\n"
 
                 "1:\n"
-                "fmla     v8.8h , %[b0].8h, %[a0].h[0]\n"
-                "fmla      v9.8h , %[b0].8h, %[a0].h[1]\n"
-                "ldr    %q[a0a], [%[a_ptr], #16]\n"
-                "fmla    v10.8h, %[b0].8h, %[a0].h[2]\n"
-                "fmla    v11.8h, %[b0].8h, %[a0].h[3]\n"
-                "ldr    %q[b2a], [%[b_ptr], #80]\n"
-                "fmla     v12.8h, %[b0].8h, %[a0].h[4]\n"
-                "fmla    v13.8h, %[b0].8h, %[a0].h[5]\n"
-                "fmla    v14.8h, %[b0].8h, %[a0].h[6]\n"
-                "fmla    v15.8h, %[b0].8h, %[a0].h[7]\n"
-                "ldr    %q[b0], [%[b_ptr], #96]\n"
+                "fmla 	v8.8h , %[b0].8h, %[a0].h[0]\n"
+                "fmla  	v9.8h , %[b0].8h, %[a0].h[1]\n"
+                "ldr	%q[a0a], [%[a_ptr], #16]\n"
+                "fmla	v10.8h, %[b0].8h, %[a0].h[2]\n"
+                "fmla	v11.8h, %[b0].8h, %[a0].h[3]\n"
+                "ldr	%q[b2a], [%[b_ptr], #80]\n"
+                "fmla 	v12.8h, %[b0].8h, %[a0].h[4]\n"
+                "fmla	v13.8h, %[b0].8h, %[a0].h[5]\n"
+                "fmla	v14.8h, %[b0].8h, %[a0].h[6]\n"
+                "fmla	v15.8h, %[b0].8h, %[a0].h[7]\n"
+                "ldr	%q[b0], [%[b_ptr], #96]\n"
 
-                "fmla    v16.8h, %[b1].8h, %[a0].h[0]\n"
-                "fmla    v17.8h, %[b1].8h, %[a0].h[1]\n" ASM_PREFETCH("[%[a_ptr], #128]")
-                "fmla    v18.8h, %[b1].8h, %[a0].h[2]\n"
-                "fmla    v19.8h, %[b1].8h, %[a0].h[3]\n"
-                "add    %[b_ptr], %[b_ptr], #96\n"
-                "fmla    v20.8h, %[b1].8h, %[a0].h[4]\n"
-                "fmla    v21.8h, %[b1].8h, %[a0].h[5]\n"
-                "fmla    v22.8h, %[b1].8h, %[a0].h[6]\n"
-                "fmla    v23.8h, %[b1].8h, %[a0].h[7]\n"
-                "ldr    %q[b1], [%[b_ptr], #16]\n"
+                "fmla	v16.8h, %[b1].8h, %[a0].h[0]\n"
+                "fmla	v17.8h, %[b1].8h, %[a0].h[1]\n"
+                ASM_PREFETCH("[%[a_ptr], #128]")
+                "fmla	v18.8h, %[b1].8h, %[a0].h[2]\n"
+                "fmla	v19.8h, %[b1].8h, %[a0].h[3]\n"
+                "add	%[b_ptr], %[b_ptr], #96\n"
+                "fmla	v20.8h, %[b1].8h, %[a0].h[4]\n"
+                "fmla	v21.8h, %[b1].8h, %[a0].h[5]\n"
+                "fmla	v22.8h, %[b1].8h, %[a0].h[6]\n"
+                "fmla	v23.8h, %[b1].8h, %[a0].h[7]\n"
+                "ldr	%q[b1], [%[b_ptr], #16]\n"
 
-                "fmla    v24.8h, %[b2].8h, %[a0].h[0]\n"
-                "fmla    v25.8h, %[b2].8h, %[a0].h[1]\n" ASM_PREFETCH("[%[b_ptr], #288]")
-                "fmla    v26.8h, %[b2].8h, %[a0].h[2]\n"
-                "fmla    v27.8h, %[b2].8h, %[a0].h[3]\n"
-                "fmla    v28.8h, %[b2].8h, %[a0].h[4]\n"
-                "fmla    v29.8h, %[b2].8h, %[a0].h[5]\n"
-                "fmla    v30.8h, %[b2].8h, %[a0].h[6]\n"
-                "fmla    v31.8h, %[b2].8h, %[a0].h[7]\n"
-                "ldr    %q[a0], [%[a_ptr], #32]\n"
+                "fmla	v24.8h, %[b2].8h, %[a0].h[0]\n"
+                "fmla	v25.8h, %[b2].8h, %[a0].h[1]\n"
+                ASM_PREFETCH("[%[b_ptr], #288]")
+                "fmla	v26.8h, %[b2].8h, %[a0].h[2]\n"
+                "fmla	v27.8h, %[b2].8h, %[a0].h[3]\n"
+                "fmla	v28.8h, %[b2].8h, %[a0].h[4]\n"
+                "fmla	v29.8h, %[b2].8h, %[a0].h[5]\n"
+                "fmla	v30.8h, %[b2].8h, %[a0].h[6]\n"
+                "fmla	v31.8h, %[b2].8h, %[a0].h[7]\n"
+                "ldr	%q[a0], [%[a_ptr], #32]\n"
 
-                "fmla     v8.8h , %[b0a].8h, %[a0a].h[0]\n"
-                "fmla    v9.8h , %[b0a].8h, %[a0a].h[1]\n"
-                "ldr    %q[b2], [%[b_ptr], #32]\n"
-                "fmla    v10.8h, %[b0a].8h, %[a0a].h[2]\n"
-                "fmla    v11.8h, %[b0a].8h, %[a0a].h[3]\n"
-                "fmla     v12.8h, %[b0a].8h, %[a0a].h[4]\n"
-                "fmla    v13.8h, %[b0a].8h, %[a0a].h[5]\n"
-                "fmla    v14.8h, %[b0a].8h, %[a0a].h[6]\n"
-                "fmla    v15.8h, %[b0a].8h, %[a0a].h[7]\n"
-                "ldr    %q[b0a], [%[b_ptr], #48]\n"
+                "fmla 	v8.8h , %[b0a].8h, %[a0a].h[0]\n"
+                "fmla	v9.8h , %[b0a].8h, %[a0a].h[1]\n"
+                "ldr	%q[b2], [%[b_ptr], #32]\n"
+                "fmla	v10.8h, %[b0a].8h, %[a0a].h[2]\n"
+                "fmla	v11.8h, %[b0a].8h, %[a0a].h[3]\n"
+                "fmla 	v12.8h, %[b0a].8h, %[a0a].h[4]\n"
+                "fmla	v13.8h, %[b0a].8h, %[a0a].h[5]\n"
+                "fmla	v14.8h, %[b0a].8h, %[a0a].h[6]\n"
+                "fmla	v15.8h, %[b0a].8h, %[a0a].h[7]\n"
+                "ldr	%q[b0a], [%[b_ptr], #48]\n"
 
-                "fmla    v16.8h, %[b1a].8h, %[a0a].h[0]\n"
-                "fmla    v17.8h, %[b1a].8h, %[a0a].h[1]\n" ASM_PREFETCH("[%[b_ptr], #352]")
-                "fmla    v18.8h, %[b1a].8h, %[a0a].h[2]\n"
-                "fmla    v19.8h, %[b1a].8h, %[a0a].h[3]\n"
-                "fmla    v20.8h, %[b1a].8h, %[a0a].h[4]\n"
-                "fmla    v21.8h, %[b1a].8h, %[a0a].h[5]\n"
-                "fmla    v22.8h, %[b1a].8h, %[a0a].h[6]\n"
-                "fmla    v23.8h, %[b1a].8h, %[a0a].h[7]\n"
-                "ldr    %q[b1a], [%[b_ptr], #64]\n"
+                "fmla	v16.8h, %[b1a].8h, %[a0a].h[0]\n"
+                "fmla	v17.8h, %[b1a].8h, %[a0a].h[1]\n"
+                ASM_PREFETCH("[%[b_ptr], #352]")
+                "fmla	v18.8h, %[b1a].8h, %[a0a].h[2]\n"
+                "fmla	v19.8h, %[b1a].8h, %[a0a].h[3]\n"
+                "fmla	v20.8h, %[b1a].8h, %[a0a].h[4]\n"
+                "fmla	v21.8h, %[b1a].8h, %[a0a].h[5]\n"
+                "fmla	v22.8h, %[b1a].8h, %[a0a].h[6]\n"
+                "fmla	v23.8h, %[b1a].8h, %[a0a].h[7]\n"
+                "ldr	%q[b1a], [%[b_ptr], #64]\n"
 
-                "fmla    v24.8h, %[b2a].8h, %[a0a].h[0]\n"
-                "fmla    v25.8h, %[b2a].8h, %[a0a].h[1]\n"
-                "add    %[a_ptr], %[a_ptr], #32\n"
-                "fmla    v26.8h, %[b2a].8h, %[a0a].h[2]\n"
-                "fmla    v27.8h, %[b2a].8h, %[a0a].h[3]\n"
-                "fmla    v28.8h, %[b2a].8h, %[a0a].h[4]\n"
-                "fmla    v29.8h, %[b2a].8h, %[a0a].h[5]\n"
-                "subs    %w[k], %w[k], #1\n"
-                "fmla    v30.8h, %[b2a].8h, %[a0a].h[6]\n"
-                "fmla    v31.8h, %[b2a].8h, %[a0a].h[7]\n"
+                "fmla	v24.8h, %[b2a].8h, %[a0a].h[0]\n"
+                "fmla	v25.8h, %[b2a].8h, %[a0a].h[1]\n"
+                "add	%[a_ptr], %[a_ptr], #32\n"
+                "fmla	v26.8h, %[b2a].8h, %[a0a].h[2]\n"
+                "fmla	v27.8h, %[b2a].8h, %[a0a].h[3]\n"
+                "fmla	v28.8h, %[b2a].8h, %[a0a].h[4]\n"
+                "fmla	v29.8h, %[b2a].8h, %[a0a].h[5]\n"
+                "subs	%w[k], %w[k], #1\n"
+                "fmla	v30.8h, %[b2a].8h, %[a0a].h[6]\n"
+                "fmla	v31.8h, %[b2a].8h, %[a0a].h[7]\n"
 
-                "bne    1b\n"
+                "bne	1b\n"
                 "4:\n"
 
                 // Jump to odd tail if necessary.
-                "cbnz    %w[oddk], 2f\n"
+                "cbnz	%w[oddk], 2f\n"
 
                 // Even tail.
-                "fmla     v8.8h , %[b0].8h, %[a0].h[0]\n"
+                "fmla 	v8.8h , %[b0].8h, %[a0].h[0]\n"
                 "fmla   v9.8h , %[b0].8h, %[a0].h[1]\n"
-                "ldr    %q[a0a], [%[a_ptr], #16]\n"
-                "fmla    v10.8h, %[b0].8h, %[a0].h[2]\n"
-                "fmla    v11.8h, %[b0].8h, %[a0].h[3]\n"
-                "ldr    %q[b2a], [%[b_ptr], #80]\n"
-                "fmla     v12.8h, %[b0].8h, %[a0].h[4]\n"
+                "ldr	%q[a0a], [%[a_ptr], #16]\n"
+                "fmla	v10.8h, %[b0].8h, %[a0].h[2]\n"
+                "fmla	v11.8h, %[b0].8h, %[a0].h[3]\n"
+                "ldr	%q[b2a], [%[b_ptr], #80]\n"
+                "fmla 	v12.8h, %[b0].8h, %[a0].h[4]\n"
                 "fmla   v13.8h, %[b0].8h, %[a0].h[5]\n"
-                "fmla    v14.8h, %[b0].8h, %[a0].h[6]\n"
-                "fmla    v15.8h, %[b0].8h, %[a0].h[7]\n"
+                "fmla	v14.8h, %[b0].8h, %[a0].h[6]\n"
+                "fmla	v15.8h, %[b0].8h, %[a0].h[7]\n"
 
-                "fmla    v16.8h, %[b1].8h, %[a0].h[0]\n"
-                "fmla    v17.8h, %[b1].8h, %[a0].h[1]\n"
-                "add    %[b_ptr], %[b_ptr], #96\n"
-                "fmla    v18.8h, %[b1].8h, %[a0].h[2]\n"
-                "fmla    v19.8h, %[b1].8h, %[a0].h[3]\n"
-                "fmla    v20.8h, %[b1].8h, %[a0].h[4]\n"
-                "fmla    v21.8h, %[b1].8h, %[a0].h[5]\n"
-                "add    %[a_ptr], %[a_ptr], #32\n"
-                "fmla    v22.8h, %[b1].8h, %[a0].h[6]\n"
-                "fmla    v23.8h, %[b1].8h, %[a0].h[7]\n"
+                "fmla	v16.8h, %[b1].8h, %[a0].h[0]\n"
+                "fmla	v17.8h, %[b1].8h, %[a0].h[1]\n"
+                "add	%[b_ptr], %[b_ptr], #96\n"
+                "fmla	v18.8h, %[b1].8h, %[a0].h[2]\n"
+                "fmla	v19.8h, %[b1].8h, %[a0].h[3]\n"
+                "fmla	v20.8h, %[b1].8h, %[a0].h[4]\n"
+                "fmla	v21.8h, %[b1].8h, %[a0].h[5]\n"
+                "add	%[a_ptr], %[a_ptr], #32\n"
+                "fmla	v22.8h, %[b1].8h, %[a0].h[6]\n"
+                "fmla	v23.8h, %[b1].8h, %[a0].h[7]\n"
 
-                "fmla    v24.8h, %[b2].8h, %[a0].h[0]\n"
-                "fmla    v25.8h, %[b2].8h, %[a0].h[1]\n"
-                "fmla    v26.8h, %[b2].8h, %[a0].h[2]\n"
-                "fmla    v27.8h, %[b2].8h, %[a0].h[3]\n"
-                "fmla    v28.8h, %[b2].8h, %[a0].h[4]\n"
-                "fmla    v29.8h, %[b2].8h, %[a0].h[5]\n"
-                "fmla    v30.8h, %[b2].8h, %[a0].h[6]\n"
-                "fmla    v31.8h, %[b2].8h, %[a0].h[7]\n"
+                "fmla	v24.8h, %[b2].8h, %[a0].h[0]\n"
+                "fmla	v25.8h, %[b2].8h, %[a0].h[1]\n"
+                "fmla	v26.8h, %[b2].8h, %[a0].h[2]\n"
+                "fmla	v27.8h, %[b2].8h, %[a0].h[3]\n"
+                "fmla	v28.8h, %[b2].8h, %[a0].h[4]\n"
+                "fmla	v29.8h, %[b2].8h, %[a0].h[5]\n"
+                "fmla	v30.8h, %[b2].8h, %[a0].h[6]\n"
+                "fmla	v31.8h, %[b2].8h, %[a0].h[7]\n"
 
-                "fmla     v8.8h , %[b0a].8h, %[a0a].h[0]\n"
-                "fmla    v16.8h, %[b1a].8h, %[a0a].h[0]\n"
-                "str    q8, [%[c_ptr]]\n"
-                "fmla    v24.8h, %[b2a].8h, %[a0a].h[0]\n"
-                "str    q16, [%[c_ptr], #16]\n"
+                "fmla 	v8.8h , %[b0a].8h, %[a0a].h[0]\n"
+                "fmla	v16.8h, %[b1a].8h, %[a0a].h[0]\n"
+                "str	q8, [%[c_ptr]]\n"
+                "fmla	v24.8h, %[b2a].8h, %[a0a].h[0]\n"
+                "str	q16, [%[c_ptr], #16]\n"
 
-                "fmla      v9.8h , %[b0a].8h, %[a0a].h[1]\n"
-                "str    q24, [%[c_ptr], #32]\n"
-                "fmla    v17.8h, %[b1a].8h, %[a0a].h[1]\n"
-                "str    q9, [%[c_ptr], #48]\n"
-                "fmla    v25.8h, %[b2a].8h, %[a0a].h[1]\n"
-                "str    q17, [%[c_ptr], #64]\n"
+                "fmla  	v9.8h , %[b0a].8h, %[a0a].h[1]\n"
+                "str	q24, [%[c_ptr], #32]\n"
+                "fmla	v17.8h, %[b1a].8h, %[a0a].h[1]\n"
+                "str	q9, [%[c_ptr], #48]\n"
+                "fmla	v25.8h, %[b2a].8h, %[a0a].h[1]\n"
+                "str	q17, [%[c_ptr], #64]\n"
 
-                "fmla    v10.8h, %[b0a].8h, %[a0a].h[2]\n"
-                "str    q25, [%[c_ptr], #80]\n"
-                "fmla    v18.8h, %[b1a].8h, %[a0a].h[2]\n"
-                "str    q10, [%[c_ptr], #96]\n"
-                "fmla    v26.8h, %[b2a].8h, %[a0a].h[2]\n"
-                "str    q18, [%[c_ptr], #112]\n"
+                "fmla	v10.8h, %[b0a].8h, %[a0a].h[2]\n"
+                "str	q25, [%[c_ptr], #80]\n"
+                "fmla	v18.8h, %[b1a].8h, %[a0a].h[2]\n"
+                "str	q10, [%[c_ptr], #96]\n"
+                "fmla	v26.8h, %[b2a].8h, %[a0a].h[2]\n"
+                "str	q18, [%[c_ptr], #112]\n"
 
-                "fmla    v11.8h, %[b0a].8h, %[a0a].h[3]\n"
-                "str    q26, [%[c_ptr], #128]\n"
-                "fmla    v19.8h, %[b1a].8h, %[a0a].h[3]\n"
-                "str    q11, [%[c_ptr], #144]\n"
-                "fmla    v27.8h, %[b2a].8h, %[a0a].h[3]\n"
-                "str    q19, [%[c_ptr], #160]\n"
+                "fmla	v11.8h, %[b0a].8h, %[a0a].h[3]\n"
+                "str	q26, [%[c_ptr], #128]\n"
+                "fmla	v19.8h, %[b1a].8h, %[a0a].h[3]\n"
+                "str	q11, [%[c_ptr], #144]\n"
+                "fmla	v27.8h, %[b2a].8h, %[a0a].h[3]\n"
+                "str	q19, [%[c_ptr], #160]\n"
 
-                "fmla     v12.8h, %[b0a].8h, %[a0a].h[4]\n"
-                "str    q27, [%[c_ptr], #176]\n"
-                "fmla    v20.8h, %[b1a].8h, %[a0a].h[4]\n"
-                "str    q12, [%[c_ptr], #192]\n"
-                "fmla    v28.8h, %[b2a].8h, %[a0a].h[4]\n"
-                "str    q20, [%[c_ptr], #208]\n"
+                "fmla 	v12.8h, %[b0a].8h, %[a0a].h[4]\n"
+                "str	q27, [%[c_ptr], #176]\n"
+                "fmla	v20.8h, %[b1a].8h, %[a0a].h[4]\n"
+                "str	q12, [%[c_ptr], #192]\n"
+                "fmla	v28.8h, %[b2a].8h, %[a0a].h[4]\n"
+                "str	q20, [%[c_ptr], #208]\n"
 
-                "fmla      v13.8h, %[b0a].8h, %[a0a].h[5]\n"
-                "str    q28, [%[c_ptr], #224]\n"
-                "fmla    v21.8h, %[b1a].8h, %[a0a].h[5]\n"
-                "str    q13, [%[c_ptr], #240]\n"
-                "fmla    v29.8h, %[b2a].8h, %[a0a].h[5]\n"
-                "str    q21, [%[c_ptr], #256]\n"
+                "fmla  	v13.8h, %[b0a].8h, %[a0a].h[5]\n"
+                "str	q28, [%[c_ptr], #224]\n"
+                "fmla	v21.8h, %[b1a].8h, %[a0a].h[5]\n"
+                "str	q13, [%[c_ptr], #240]\n"
+                "fmla	v29.8h, %[b2a].8h, %[a0a].h[5]\n"
+                "str	q21, [%[c_ptr], #256]\n"
 
-                "fmla    v14.8h, %[b0a].8h, %[a0a].h[6]\n"
-                "str    q29, [%[c_ptr], #272]\n"
-                "fmla    v22.8h, %[b1a].8h, %[a0a].h[6]\n"
-                "str    q14, [%[c_ptr], #288]\n"
-                "fmla    v30.8h, %[b2a].8h, %[a0a].h[6]\n"
-                "str    q22, [%[c_ptr], #304]\n"
+                "fmla	v14.8h, %[b0a].8h, %[a0a].h[6]\n"
+                "str	q29, [%[c_ptr], #272]\n"
+                "fmla	v22.8h, %[b1a].8h, %[a0a].h[6]\n"
+                "str	q14, [%[c_ptr], #288]\n"
+                "fmla	v30.8h, %[b2a].8h, %[a0a].h[6]\n"
+                "str	q22, [%[c_ptr], #304]\n"
 
-                "fmla    v15.8h, %[b0a].8h, %[a0a].h[7]\n"
-                "str    q30, [%[c_ptr], #320]\n"
-                "fmla    v23.8h, %[b1a].8h, %[a0a].h[7]\n"
-                "str    q15, [%[c_ptr], #336]\n"
-                "fmla    v31.8h, %[b2a].8h, %[a0a].h[7]\n"
-                "b    3f\n"
+                "fmla	v15.8h, %[b0a].8h, %[a0a].h[7]\n"
+                "str	q30, [%[c_ptr], #320]\n"
+                "fmla	v23.8h, %[b1a].8h, %[a0a].h[7]\n"
+                "str	q15, [%[c_ptr], #336]\n"
+                "fmla	v31.8h, %[b2a].8h, %[a0a].h[7]\n"
+                "b	3f\n"
 
                 // Odd tail
                 "2:\n"
-                "fmla     v8.8h , %[b0].8h, %[a0].h[0]\n"
-                "add    %[b_ptr], %[b_ptr], #48\n"
-                "fmla    v16.8h, %[b1].8h, %[a0].h[0]\n"
-                "add    %[a_ptr], %[a_ptr], #16\n"
-                "str    q8, [%[c_ptr]]\n"
-                "fmla    v24.8h, %[b2].8h, %[a0].h[0]\n"
-                "str    q16, [%[c_ptr], #16]\n"
+                "fmla 	v8.8h , %[b0].8h, %[a0].h[0]\n"
+                "add	%[b_ptr], %[b_ptr], #48\n"
+                "fmla	v16.8h, %[b1].8h, %[a0].h[0]\n"
+                "add	%[a_ptr], %[a_ptr], #16\n"
+                "str	q8, [%[c_ptr]]\n"
+                "fmla	v24.8h, %[b2].8h, %[a0].h[0]\n"
+                "str	q16, [%[c_ptr], #16]\n"
 
-                "fmla      v9.8h , %[b0].8h, %[a0].h[1]\n"
-                "str    q24, [%[c_ptr], #32]\n"
-                "fmla    v17.8h, %[b1].8h, %[a0].h[1]\n"
-                "str    q9, [%[c_ptr], #48]\n"
-                "fmla    v25.8h, %[b2].8h, %[a0].h[1]\n"
-                "str    q17, [%[c_ptr], #64]\n"
+                "fmla  	v9.8h , %[b0].8h, %[a0].h[1]\n"
+                "str	q24, [%[c_ptr], #32]\n"
+                "fmla	v17.8h, %[b1].8h, %[a0].h[1]\n"
+                "str	q9, [%[c_ptr], #48]\n"
+                "fmla	v25.8h, %[b2].8h, %[a0].h[1]\n"
+                "str	q17, [%[c_ptr], #64]\n"
 
-                "fmla    v10.8h, %[b0].8h, %[a0].h[2]\n"
-                "str    q25, [%[c_ptr], #80]\n"
-                "fmla    v18.8h, %[b1].8h, %[a0].h[2]\n"
-                "str    q10, [%[c_ptr], #96]\n"
-                "fmla    v26.8h, %[b2].8h, %[a0].h[2]\n"
-                "str    q18, [%[c_ptr], #112]\n"
+                "fmla	v10.8h, %[b0].8h, %[a0].h[2]\n"
+                "str	q25, [%[c_ptr], #80]\n"
+                "fmla	v18.8h, %[b1].8h, %[a0].h[2]\n"
+                "str	q10, [%[c_ptr], #96]\n"
+                "fmla	v26.8h, %[b2].8h, %[a0].h[2]\n"
+                "str	q18, [%[c_ptr], #112]\n"
 
-                "fmla    v11.8h, %[b0].8h, %[a0].h[3]\n"
-                "str    q26, [%[c_ptr], #128]\n"
-                "fmla    v19.8h, %[b1].8h, %[a0].h[3]\n"
-                "str    q11, [%[c_ptr], #144]\n"
-                "fmla    v27.8h, %[b2].8h, %[a0].h[3]\n"
-                "str    q19, [%[c_ptr], #160]\n"
+                "fmla	v11.8h, %[b0].8h, %[a0].h[3]\n"
+                "str	q26, [%[c_ptr], #128]\n"
+                "fmla	v19.8h, %[b1].8h, %[a0].h[3]\n"
+                "str	q11, [%[c_ptr], #144]\n"
+                "fmla	v27.8h, %[b2].8h, %[a0].h[3]\n"
+                "str	q19, [%[c_ptr], #160]\n"
 
-                "fmla     v12.8h, %[b0].8h, %[a0].h[4]\n"
-                "str    q27, [%[c_ptr], #176]\n"
-                "fmla    v20.8h, %[b1].8h, %[a0].h[4]\n"
-                "str    q12, [%[c_ptr], #192]\n"
-                "fmla    v28.8h, %[b2].8h, %[a0].h[4]\n"
-                "str    q20, [%[c_ptr], #208]\n"
+                "fmla 	v12.8h, %[b0].8h, %[a0].h[4]\n"
+                "str	q27, [%[c_ptr], #176]\n"
+                "fmla	v20.8h, %[b1].8h, %[a0].h[4]\n"
+                "str	q12, [%[c_ptr], #192]\n"
+                "fmla	v28.8h, %[b2].8h, %[a0].h[4]\n"
+                "str	q20, [%[c_ptr], #208]\n"
 
-                "fmla      v13.8h, %[b0].8h, %[a0].h[5]\n"
-                "str    q28, [%[c_ptr], #224]\n"
-                "fmla    v21.8h, %[b1].8h, %[a0].h[5]\n"
-                "str    q13, [%[c_ptr], #240]\n"
-                "fmla    v29.8h, %[b2].8h, %[a0].h[5]\n"
-                "str    q21, [%[c_ptr], #256]\n"
+                "fmla  	v13.8h, %[b0].8h, %[a0].h[5]\n"
+                "str	q28, [%[c_ptr], #224]\n"
+                "fmla	v21.8h, %[b1].8h, %[a0].h[5]\n"
+                "str	q13, [%[c_ptr], #240]\n"
+                "fmla	v29.8h, %[b2].8h, %[a0].h[5]\n"
+                "str	q21, [%[c_ptr], #256]\n"
 
-                "fmla    v14.8h, %[b0].8h, %[a0].h[6]\n"
-                "str    q29, [%[c_ptr], #272]\n"
-                "fmla    v22.8h, %[b1].8h, %[a0].h[6]\n"
-                "str    q14, [%[c_ptr], #288]\n"
-                "fmla    v30.8h, %[b2].8h, %[a0].h[6]\n"
-                "str    q22, [%[c_ptr], #304]\n"
+                "fmla	v14.8h, %[b0].8h, %[a0].h[6]\n"
+                "str	q29, [%[c_ptr], #272]\n"
+                "fmla	v22.8h, %[b1].8h, %[a0].h[6]\n"
+                "str	q14, [%[c_ptr], #288]\n"
+                "fmla	v30.8h, %[b2].8h, %[a0].h[6]\n"
+                "str	q22, [%[c_ptr], #304]\n"
 
-                "fmla    v15.8h, %[b0].8h, %[a0].h[7]\n"
-                "str    q30, [%[c_ptr], #320]\n"
-                "fmla    v23.8h, %[b1].8h, %[a0].h[7]\n"
-                "str    q15, [%[c_ptr], #336]\n"
-                "fmla    v31.8h, %[b2].8h, %[a0].h[7]\n"
+                "fmla	v15.8h, %[b0].8h, %[a0].h[7]\n"
+                "str	q30, [%[c_ptr], #320]\n"
+                "fmla	v23.8h, %[b1].8h, %[a0].h[7]\n"
+                "str	q15, [%[c_ptr], #336]\n"
+                "fmla	v31.8h, %[b2].8h, %[a0].h[7]\n"
 
                 "3:\n"
-                "str    q23, [%[c_ptr], #352]\n"
-                "str    q31, [%[c_ptr], #368]\n"
-                "add    %[c_ptr], %[c_ptr], #384\n"
-                :
-                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
-                [a0] "+w"(a0), [a0a] "+w"(a0a),
-                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k),
-                [b0a] "+w"(b0a), [b1a] "+w"(b1a), [b2a] "+w"(b2a)
-                : [oddk] "r"(oddk)
-                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
-                "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+                "str	q23, [%[c_ptr], #352]\n"
+                "str	q31, [%[c_ptr], #368]\n"
+                "add	%[c_ptr], %[c_ptr], #384\n"
+            :
+              [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+              [a0] "+w" (a0), [a0a] "+w" (a0a),
+              [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k),
+              [b0a] "+w" (b0a), [b1a] "+w" (b1a), [b2a] "+w" (b2a)
+            : [oddk] "r" (oddk)
+            : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+              "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
+            );
         }
     }
 }
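(Reading aid only, not part of the patch: the hunks above repeatedly re-flow the same even/odd-K bookkeeping that every kernel in this family shares — the "oddk"/"k" setup and the "4:"/"2:"/"3:" labels. The C++ sketch below is a control-flow outline of that structure with the vector work left as placeholder comments; it is an illustration, not library code.)

// Control-flow sketch behind "cbz %w[k], 4f", "cbnz %w[oddk], 2f" and the
// common tail at "3:". Bodies are intentionally empty placeholders.
static void kernel_control_flow_sketch(int K) {
    int oddk = (K & 1);           // one leftover k-step if K is odd
    int k = ((K + 1) / 2) - 1;    // full two-step main-loop iterations

    if (k > 0) {                  // "cbz %w[k], 4f" skips the main loop
        do {
            // main loop: unroll 0 + unroll 1 (two k-steps per pass)
        } while (--k != 0);       // "subs %w[k], %w[k], #1" / "bne 1b"
    }
    if (!oddk) {
        // even tail ("4:" falls through): one final unrolled pair of k-steps
    } else {
        // odd tail ("2:"): one final single k-step
    }
    // common tail ("3:"): store the accumulator registers out to C
}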
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
index 91a9e8d..c91d504 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
@@ -25,8 +25,8 @@
 
 #ifdef __aarch64__
 
-namespace arm_gemm
-{
+namespace arm_gemm {
+
 // Actual kernel implementations
 void a64_sgemm_asimd_12x8(const float *, const float *, float *, int, int, int);
 void a64_sgemm_asimd_12x8_a53(const float *, const float *, float *, int, int, int);
@@ -41,8 +41,7 @@
 // All kernels in the family must share these characteristics.  The actual
 // kernel to be used can be chosen at runtime, based on the CPU_type
 // structure.
-class sgemm_12x8
-{
+class sgemm_12x8 {
 public:
     typedef float operand_type;
     typedef float result_type;
@@ -51,26 +50,24 @@
 
     /* Describes the data layout for A input */
     static const int A_interleave = 8;
-    static const int A_block      = 1;
-    static const int A_transpose  = 0;
+    static const int A_block = 1;
+    static const int A_transpose = 0;
 
     /* Same for B input */
     static const int B_interleave = 12;
-    static const int B_block      = 1;
-    static const int B_transpose  = 1;
+    static const int B_block = 1;
+    static const int B_transpose = 1;
 
     /* Kernel blocking parameters */
-    static const int out_width  = 12;
+    static const int out_width = 12;
     static const int out_height = 8;
-    static const int k_unroll   = 1;
+    static const int k_unroll = 1;
 
-    kern_type kernel = a64_sgemm_asimd_12x8;
+    kern_type kernel=a64_sgemm_asimd_12x8;
 
-    sgemm_12x8(const CPUInfo *ci)
-    {
+    sgemm_12x8(const CPUInfo *ci) {
         // Select specific kernel if available
-        switch(ci->get_cpu_model())
-        {
+        switch(ci->get_cpu_model()) {
             case CPUModel::A53:
                 kernel = a64_sgemm_asimd_12x8_a53;
                 break;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp
index 618ebc7..2400191 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp
@@ -27,333 +27,347 @@
 
 #include "../../asmlib.hpp"
 
-namespace arm_gemm
-{
-void a64_sgemm_asimd_12x8_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K)
-{
+namespace arm_gemm {
+
+void a64_sgemm_asimd_12x8_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
     const float *a_ptr = Apanel;
-    float       *c_ptr = Cpanel;
+    float *c_ptr = Cpanel;
 
-    for(int yb = 0; yb < ablocks; yb++)
-    {
+    for (int yb=0; yb<ablocks; yb++) {
         const float *a_ptr0 = a_ptr;
-        const float *b_ptr  = Bpanel;
+        const float *b_ptr = Bpanel;
 
-        for(int xb = 0; xb < bblocks; xb++)
-        {
+        for (int xb=0; xb<bblocks; xb++) {
             a_ptr = a_ptr0;
             // Fix up for odd lengths - set a flag if K is odd, but make
             // sure we round up the iteration count.
             int oddk = (K & 1);
-            int k    = ((K + 1) / 2) - 1;
+            int k = ((K+1)/2) - 1;
 
-            register float32x4_t a0 asm("v0");
-            register float32x4_t a1 asm("v1");
-            register float32x4_t b0 asm("v2");
-            register float32x4_t b1 asm("v3");
-            register float32x4_t b2 asm("v4");
+            register float32x4_t a0  asm("v0");
+            register float32x4_t a1  asm("v1");
+            register float32x4_t b0  asm("v2");
+            register float32x4_t b1  asm("v3");
+            register float32x4_t b2  asm("v4");
             register float32x4_t a0a asm("v5");
             register float32x4_t a1a asm("v6");
 
-            __asm __volatile(
+            __asm __volatile (
                 // Initialize result registers, load initial operands, prime prefetches.
-                "movi    v8.4s, #0x0\n"
-                "ldr    %q[a0], [%[a_ptr]]\n"
-                "movi    v9.4s, #0x0\n"
-                "ldr    %q[b0], [%[b_ptr]]\n"
-                "movi    v10.4s, #0x0\n"
-                "ldr    %q[a1], [%[a_ptr], #16]\n"
-                "movi    v11.4s, #0x0\n"
-                "ldr    %q[b1], [%[b_ptr], #16]\n"
-                "movi    v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi    v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi    v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi    v15.4s, #0x0\n"
-                ASM_PREFETCH("[%[a_ptr], #128]") "movi    v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi    v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") "movi    v18.4s, #0x0\n"
-                ASM_PREFETCH("[%[a_ptr], #192]") "movi    v19.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]") "movi    v20.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]") "movi    v21.4s, #0x0\n"
+                "movi	v8.4s, #0x0\n"
+                "ldr	%q[a0], [%[a_ptr]]\n"
+                "movi	v9.4s, #0x0\n"
+                "ldr	%q[b0], [%[b_ptr]]\n"
+                "movi	v10.4s, #0x0\n"
+                "ldr	%q[a1], [%[a_ptr], #16]\n"
+                "movi	v11.4s, #0x0\n"
+                "ldr	%q[b1], [%[b_ptr], #16]\n"
+                "movi	v12.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi	v13.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi	v14.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi	v15.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #128]")
+                "movi	v16.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi	v17.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi	v18.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi	v19.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi	v20.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #256]")
+                "movi	v21.4s, #0x0\n"
                 ASM_PREFETCH("[%[b_ptr], #384]")
-                "movi    v22.4s, #0x0\n"
-                "movi    v23.4s, #0x0\n"
-                "movi    v24.4s, #0x0\n"
-                "movi    v25.4s, #0x0\n"
-                "movi    v26.4s, #0x0\n"
-                "movi    v27.4s, #0x0\n"
-                "movi    v28.4s, #0x0\n"
-                "movi    v29.4s, #0x0\n"
-                "movi    v30.4s, #0x0\n"
-                "movi    v31.4s, #0x0\n"
+                "movi	v22.4s, #0x0\n"
+                "movi	v23.4s, #0x0\n"
+                "movi	v24.4s, #0x0\n"
+                "movi	v25.4s, #0x0\n"
+                "movi	v26.4s, #0x0\n"
+                "movi	v27.4s, #0x0\n"
+                "movi	v28.4s, #0x0\n"
+                "movi	v29.4s, #0x0\n"
+                "movi	v30.4s, #0x0\n"
+                "movi	v31.4s, #0x0\n"
 
                 // Skip loop if we are doing zero iterations of it.
-                "cbz    %w[k], 4f\n"
+                "cbz	%w[k], 4f\n"
 
                 "1:\n"
                 // Unroll 0
-                "ldr    %d[b2], [%[b_ptr], #32]\n"
+                "ldr	%d[b2], [%[b_ptr], #32]\n"
                 "nop\n"
-                "fmla    v8.4s , %[b0].4s, %[a0].s[0]\n"
-                "ldr    x20, [%[b_ptr], #40]\n"
-                "fmla    v9.4s , %[b0].4s, %[a0].s[1]\n"
-                "subs    %w[k], %w[k], #1\n"
-                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "fmla	v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "ldr	x20, [%[b_ptr], #40]\n"
+                "fmla	v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "subs	%w[k], %w[k], #1\n"
+                "fmla	v10.4s, %[b0].4s, %[a0].s[2]\n"
 
-                "ldr    %d[a0a], [%[a_ptr], #32]\n"
-                "ins    %[b2].d[1], x20\n"
-                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
-                "ldr    x20, [%[a_ptr], #40]\n"
-                "fmla    v12.4s, %[b0].4s, %[a1].s[0]\n"
-                "fmla    v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "ldr	%d[a0a], [%[a_ptr], #32]\n"
+                "ins	%[b2].d[1], x20\n"
+                "fmla	v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "ldr	x20, [%[a_ptr], #40]\n"
+                "fmla	v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla	v13.4s, %[b0].4s, %[a1].s[1]\n"
 
-                "ldr    %d[a1a], [%[a_ptr], #48]\n"
-                "ins    %[a0a].d[1], x20\n"
-                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
-                "ldr    x20, [%[a_ptr], #56]\n"
-                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
-                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "ldr	%d[a1a], [%[a_ptr], #48]\n"
+                "ins	%[a0a].d[1], x20\n"
+                "fmla	v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "ldr	x20, [%[a_ptr], #56]\n"
+                "fmla	v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "fmla	v16.4s, %[b1].4s, %[a0].s[0]\n"
 
-                "ldr    %d[b0], [%[b_ptr], #48]\n"
-                "ins    %[a1a].d[1], x20\n"
-                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n"
-                "ldr    x20, [%[b_ptr], #56]\n"
-                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
-                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "ldr	%d[b0], [%[b_ptr], #48]\n"
+                "ins	%[a1a].d[1], x20\n"
+                "fmla	v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "ldr	x20, [%[b_ptr], #56]\n"
+                "fmla	v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0].s[3]\n"
 
                 ASM_PREFETCH("[%[a_ptr], #320]")
-                "ins    %[b0].d[1], x20\n"
-                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
-                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
-                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "ins	%[b0].d[1], x20\n"
+                "fmla	v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1].s[2]\n"
 
                 ASM_PREFETCH("[%[b_ptr], #448]")
                 "nop\n"
-                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
-                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
-                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "fmla	v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0].s[1]\n"
 
-                "ldr    %d[b1], [%[b_ptr], #64]\n"
+                "ldr	%d[b1], [%[b_ptr], #64]\n"
                 "nop\n"
-                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
-                "ldr    x20, [%[b_ptr], #72]\n"
-                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n"
-                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "fmla	v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "ldr	x20, [%[b_ptr], #72]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "fmla	v28.4s, %[b2].4s, %[a1].s[0]\n"
 
                 ASM_PREFETCH("[%[b_ptr], #512]")
-                "ins    %[b1].d[1], x20\n"
-                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n"
-                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n"
-                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+                "ins	%[b1].d[1], x20\n"
+                "fmla	v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "fmla	v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "fmla	v31.4s, %[b2].4s, %[a1].s[3]\n"
 
                 // Unroll 1
-                "ldr    %d[b2], [%[b_ptr], #80]\n"
+                "ldr	%d[b2], [%[b_ptr], #80]\n"
                 "nop\n"
-                "fmla    v8.4s , %[b0].4s, %[a0a].s[0]\n"
-                "ldr    x20, [%[b_ptr], #88]\n"
-                "fmla    v9.4s , %[b0].4s, %[a0a].s[1]\n"
-                "fmla    v10.4s, %[b0].4s, %[a0a].s[2]\n"
+                "fmla	v8.4s , %[b0].4s, %[a0a].s[0]\n"
+                "ldr	x20, [%[b_ptr], #88]\n"
+                "fmla	v9.4s , %[b0].4s, %[a0a].s[1]\n"
+                "fmla	v10.4s, %[b0].4s, %[a0a].s[2]\n"
 
-                "ldr    %d[a0], [%[a_ptr], #64]\n"
-                "ins    %[b2].d[1], x20\n"
-                "fmla    v11.4s, %[b0].4s, %[a0a].s[3]\n"
-                "ldr    x20, [%[a_ptr], #72]\n"
-                "fmla    v12.4s, %[b0].4s, %[a1a].s[0]\n"
-                "fmla    v13.4s, %[b0].4s, %[a1a].s[1]\n"
+                "ldr	%d[a0], [%[a_ptr], #64]\n"
+                "ins	%[b2].d[1], x20\n"
+                "fmla	v11.4s, %[b0].4s, %[a0a].s[3]\n"
+                "ldr	x20, [%[a_ptr], #72]\n"
+                "fmla	v12.4s, %[b0].4s, %[a1a].s[0]\n"
+                "fmla	v13.4s, %[b0].4s, %[a1a].s[1]\n"
 
-                "ldr    %d[a1], [%[a_ptr], #80]\n"
-                "ins    %[a0].d[1], x20\n"
-                "fmla    v14.4s, %[b0].4s, %[a1a].s[2]\n"
-                "ldr    x20, [%[a_ptr], #88]\n"
-                "fmla    v15.4s, %[b0].4s, %[a1a].s[3]\n"
-                "fmla    v16.4s, %[b1].4s, %[a0a].s[0]\n"
+                "ldr	%d[a1], [%[a_ptr], #80]\n"
+                "ins	%[a0].d[1], x20\n"
+                "fmla	v14.4s, %[b0].4s, %[a1a].s[2]\n"
+                "ldr	x20, [%[a_ptr], #88]\n"
+                "fmla	v15.4s, %[b0].4s, %[a1a].s[3]\n"
+                "fmla	v16.4s, %[b1].4s, %[a0a].s[0]\n"
 
-                "ldr    %d[b0], [%[b_ptr], #96]\n"
-                "ins    %[a1].d[1], x20\n"
-                "fmla    v17.4s, %[b1].4s, %[a0a].s[1]\n"
-                "ldr    x20, [%[b_ptr], #104]\n"
-                "fmla    v18.4s, %[b1].4s, %[a0a].s[2]\n"
-                "fmla    v19.4s, %[b1].4s, %[a0a].s[3]\n"
+                "ldr	%d[b0], [%[b_ptr], #96]\n"
+                "ins	%[a1].d[1], x20\n"
+                "fmla	v17.4s, %[b1].4s, %[a0a].s[1]\n"
+                "ldr	x20, [%[b_ptr], #104]\n"
+                "fmla	v18.4s, %[b1].4s, %[a0a].s[2]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0a].s[3]\n"
 
                 "nop\n"
-                "ins    %[b0].d[1], x20\n"
-                "fmla    v20.4s, %[b1].4s, %[a1a].s[0]\n"
-                "fmla    v21.4s, %[b1].4s, %[a1a].s[1]\n"
-                "fmla    v22.4s, %[b1].4s, %[a1a].s[2]\n"
+                "ins	%[b0].d[1], x20\n"
+                "fmla	v20.4s, %[b1].4s, %[a1a].s[0]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1a].s[1]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1a].s[2]\n"
 
                 "nop\n"
                 "nop\n"
-                "fmla    v23.4s, %[b1].4s, %[a1a].s[3]\n"
-                "fmla    v24.4s, %[b2].4s, %[a0a].s[0]\n"
-                "fmla    v25.4s, %[b2].4s, %[a0a].s[1]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1a].s[3]\n"
+                "fmla	v24.4s, %[b2].4s, %[a0a].s[0]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0a].s[1]\n"
 
-                "ldr    %d[b1], [%[b_ptr], #112]\n"
+                "ldr	%d[b1], [%[b_ptr], #112]\n"
                 "nop\n"
-                "fmla    v26.4s, %[b2].4s, %[a0a].s[2]\n"
-                "ldr    x20, [%[b_ptr], #120]\n"
-                "fmla    v27.4s, %[b2].4s, %[a0a].s[3]\n"
-                "add    %[a_ptr], %[a_ptr], #64\n"
-                "fmla    v28.4s, %[b2].4s, %[a1a].s[0]\n"
-                "add    %[b_ptr], %[b_ptr], #96\n"
+                "fmla	v26.4s, %[b2].4s, %[a0a].s[2]\n"
+                "ldr	x20, [%[b_ptr], #120]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0a].s[3]\n"
+                "add	%[a_ptr], %[a_ptr], #64\n"
+                "fmla	v28.4s, %[b2].4s, %[a1a].s[0]\n"
+                "add	%[b_ptr], %[b_ptr], #96\n"
 
                 "nop\n"
-                "ins    %[b1].d[1], x20\n"
-                "fmla    v29.4s, %[b2].4s, %[a1a].s[1]\n"
-                "fmla    v30.4s, %[b2].4s, %[a1a].s[2]\n"
-                "fmla    v31.4s, %[b2].4s, %[a1a].s[3]\n"
+                "ins	%[b1].d[1], x20\n"
+                "fmla	v29.4s, %[b2].4s, %[a1a].s[1]\n"
+                "fmla	v30.4s, %[b2].4s, %[a1a].s[2]\n"
+                "fmla	v31.4s, %[b2].4s, %[a1a].s[3]\n"
 
-                "bne    1b\n"
+                "bne	1b\n"
 
                 // Branch here if K=1 or 2.  Do the right thing for odd/even at the end.
                 "4:\n"
-                "cbnz    %w[oddk], 2f\n"
+                "cbnz	%w[oddk], 2f\n"
 
                 // Detached final iteration. (even K)
-                "ldr    %d[b2], [%[b_ptr], #32]\n"
+                "ldr	%d[b2], [%[b_ptr], #32]\n"
                 "nop\n"
-                "fmla    v8.4s , %[b0].4s, %[a0].s[0]\n"
-                "ldr    x20, [%[b_ptr], #40]\n"
-                "fmla    v9.4s , %[b0].4s, %[a0].s[1]\n"
-                "subs    %w[k], %w[k], #1\n"
-                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "fmla	v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "ldr	x20, [%[b_ptr], #40]\n"
+                "fmla	v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "subs	%w[k], %w[k], #1\n"
+                "fmla	v10.4s, %[b0].4s, %[a0].s[2]\n"
 
-                "ldr    %d[a0a], [%[a_ptr], #32]\n"
-                "ins    %[b2].d[1], x20\n"
-                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
-                "ldr    x20, [%[a_ptr], #40]\n"
-                "fmla    v12.4s, %[b0].4s, %[a1].s[0]\n"
-                "fmla    v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "ldr	%d[a0a], [%[a_ptr], #32]\n"
+                "ins	%[b2].d[1], x20\n"
+                "fmla	v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "ldr	x20, [%[a_ptr], #40]\n"
+                "fmla	v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla	v13.4s, %[b0].4s, %[a1].s[1]\n"
 
-                "ldr    %d[a1a], [%[a_ptr], #48]\n"
-                "ins    %[a0a].d[1], x20\n"
-                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
-                "ldr    x20, [%[a_ptr], #56]\n"
-                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
-                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "ldr	%d[a1a], [%[a_ptr], #48]\n"
+                "ins	%[a0a].d[1], x20\n"
+                "fmla	v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "ldr	x20, [%[a_ptr], #56]\n"
+                "fmla	v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "fmla	v16.4s, %[b1].4s, %[a0].s[0]\n"
 
-                "ldr    %d[b0], [%[b_ptr], #48]\n"
-                "ins    %[a1a].d[1], x20\n"
-                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n"
-                "ldr    x20, [%[b_ptr], #56]\n"
-                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
-                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "ldr	%d[b0], [%[b_ptr], #48]\n"
+                "ins	%[a1a].d[1], x20\n"
+                "fmla	v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "ldr	x20, [%[b_ptr], #56]\n"
+                "fmla	v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0].s[3]\n"
 
-                "ins    %[b0].d[1], x20\n"
-                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
-                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
-                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "ins	%[b0].d[1], x20\n"
+                "fmla	v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1].s[2]\n"
 
                 "nop\n"
-                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
-                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
-                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "fmla	v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0].s[1]\n"
 
-                "ldr    %d[b1], [%[b_ptr], #64]\n"
+                "ldr	%d[b1], [%[b_ptr], #64]\n"
                 "nop\n"
-                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
-                "ldr    x20, [%[b_ptr], #72]\n"
-                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n"
-                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "fmla	v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "ldr	x20, [%[b_ptr], #72]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "fmla	v28.4s, %[b2].4s, %[a1].s[0]\n"
 
-                "ins    %[b1].d[1], x20\n"
-                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n"
-                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n"
-                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+                "ins	%[b1].d[1], x20\n"
+                "fmla	v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "fmla	v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "fmla	v31.4s, %[b2].4s, %[a1].s[3]\n"
 
-                "ldr    %d[b2], [%[b_ptr], #80]\n"
+                "ldr	%d[b2], [%[b_ptr], #80]\n"
                 "nop\n"
-                "fmla    v8.4s , %[b0].4s, %[a0a].s[0]\n"
-                "ldr    x20, [%[b_ptr], #88]\n"
-                "fmla    v9.4s , %[b0].4s, %[a0a].s[1]\n"
-                "fmla    v10.4s, %[b0].4s, %[a0a].s[2]\n"
+                "fmla	v8.4s , %[b0].4s, %[a0a].s[0]\n"
+                "ldr	x20, [%[b_ptr], #88]\n"
+                "fmla	v9.4s , %[b0].4s, %[a0a].s[1]\n"
+                "fmla	v10.4s, %[b0].4s, %[a0a].s[2]\n"
 
-                "ins    %[b2].d[1], x20\n"
-                "fmla    v11.4s, %[b0].4s, %[a0a].s[3]\n"
-                "fmla    v12.4s, %[b0].4s, %[a1a].s[0]\n"
-                "fmla    v13.4s, %[b0].4s, %[a1a].s[1]\n"
-                "fmla    v14.4s, %[b0].4s, %[a1a].s[2]\n"
-                "fmla    v15.4s, %[b0].4s, %[a1a].s[3]\n"
-                "fmla    v16.4s, %[b1].4s, %[a0a].s[0]\n"
-                "fmla    v17.4s, %[b1].4s, %[a0a].s[1]\n"
-                "fmla    v18.4s, %[b1].4s, %[a0a].s[2]\n"
-                "fmla    v19.4s, %[b1].4s, %[a0a].s[3]\n"
-                "fmla    v20.4s, %[b1].4s, %[a1a].s[0]\n"
-                "fmla    v21.4s, %[b1].4s, %[a1a].s[1]\n"
-                "fmla    v22.4s, %[b1].4s, %[a1a].s[2]\n"
-                "fmla    v23.4s, %[b1].4s, %[a1a].s[3]\n"
-                "fmla    v24.4s, %[b2].4s, %[a0a].s[0]\n"
-                "fmla    v25.4s, %[b2].4s, %[a0a].s[1]\n"
-                "fmla    v26.4s, %[b2].4s, %[a0a].s[2]\n"
-                "fmla    v27.4s, %[b2].4s, %[a0a].s[3]\n"
-                "fmla    v28.4s, %[b2].4s, %[a1a].s[0]\n"
-                "fmla    v29.4s, %[b2].4s, %[a1a].s[1]\n"
-                "add    %[a_ptr], %[a_ptr], #64\n"
-                "fmla    v30.4s, %[b2].4s, %[a1a].s[2]\n"
-                "add    %[b_ptr], %[b_ptr], #96\n"
-                "fmla    v31.4s, %[b2].4s, %[a1a].s[3]\n"
-                "b    3f\n"
+                "ins	%[b2].d[1], x20\n"
+                "fmla	v11.4s, %[b0].4s, %[a0a].s[3]\n"
+                "fmla	v12.4s, %[b0].4s, %[a1a].s[0]\n"
+                "fmla	v13.4s, %[b0].4s, %[a1a].s[1]\n"
+                "fmla	v14.4s, %[b0].4s, %[a1a].s[2]\n"
+                "fmla	v15.4s, %[b0].4s, %[a1a].s[3]\n"
+                "fmla	v16.4s, %[b1].4s, %[a0a].s[0]\n"
+                "fmla	v17.4s, %[b1].4s, %[a0a].s[1]\n"
+                "fmla	v18.4s, %[b1].4s, %[a0a].s[2]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0a].s[3]\n"
+                "fmla	v20.4s, %[b1].4s, %[a1a].s[0]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1a].s[1]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1a].s[2]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1a].s[3]\n"
+                "fmla	v24.4s, %[b2].4s, %[a0a].s[0]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0a].s[1]\n"
+                "fmla	v26.4s, %[b2].4s, %[a0a].s[2]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0a].s[3]\n"
+                "fmla	v28.4s, %[b2].4s, %[a1a].s[0]\n"
+                "fmla	v29.4s, %[b2].4s, %[a1a].s[1]\n"
+                "add	%[a_ptr], %[a_ptr], #64\n"
+                "fmla	v30.4s, %[b2].4s, %[a1a].s[2]\n"
+                "add	%[b_ptr], %[b_ptr], #96\n"
+                "fmla	v31.4s, %[b2].4s, %[a1a].s[3]\n"
+                "b	3f\n"
 
                 // Detached final iteration. (odd K)
                 "2:\n"
-                "ldr    %d[b2], [%[b_ptr], #32]\n"
+                "ldr	%d[b2], [%[b_ptr], #32]\n"
                 "nop\n"
-                "fmla    v8.4s , %[b0].4s, %[a0].s[0]\n"
-                "ldr    x20, [%[b_ptr], #40]\n"
-                "fmla    v9.4s , %[b0].4s, %[a0].s[1]\n"
-                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "fmla	v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "ldr	x20, [%[b_ptr], #40]\n"
+                "fmla	v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "fmla	v10.4s, %[b0].4s, %[a0].s[2]\n"
 
-                "ins    %[b2].d[1], x20\n"
-                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
-                "fmla    v12.4s, %[b0].4s, %[a1].s[0]\n"
-                "fmla    v13.4s, %[b0].4s, %[a1].s[1]\n"
-                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
-                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
-                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
-                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n"
-                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
-                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
-                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
-                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
-                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
-                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
-                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
-                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n"
-                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
-                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n"
-                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
-                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n"
-                "add    %[a_ptr], %[a_ptr], #32\n"
-                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n"
-                "add    %[b_ptr], %[b_ptr], #48\n"
-                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+                "ins	%[b2].d[1], x20\n"
+                "fmla	v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "fmla	v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla	v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "fmla	v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "fmla	v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "fmla	v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "fmla	v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "fmla	v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "fmla	v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "fmla	v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0].s[1]\n"
+                "fmla	v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "fmla	v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "fmla	v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "add	%[a_ptr], %[a_ptr], #32\n"
+                "fmla	v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "add	%[b_ptr], %[b_ptr], #48\n"
+                "fmla	v31.4s, %[b2].4s, %[a1].s[3]\n"
 
                 // Common tail
                 "3:\n"
-                "str    q8,  [%[c_ptr]]\n"
-                "str    q16,  [%[c_ptr], #16]\n"
-                "str    q24,  [%[c_ptr], #32]\n"
-                "str    q9,  [%[c_ptr], #48]\n"
-                "str    q17,  [%[c_ptr], #64]\n"
-                "str    q25,  [%[c_ptr], #80]\n"
-                "str    q10,  [%[c_ptr], #96]\n"
-                "str    q18,  [%[c_ptr], #112]\n"
-                "str    q26,  [%[c_ptr], #128]\n"
-                "str    q11,  [%[c_ptr], #144]\n"
-                "str    q19,  [%[c_ptr], #160]\n"
-                "str    q27,  [%[c_ptr], #176]\n"
-                "str    q12,  [%[c_ptr], #192]\n"
-                "str    q20,  [%[c_ptr], #208]\n"
-                "str    q28,  [%[c_ptr], #224]\n"
-                "str    q13,  [%[c_ptr], #240]\n"
-                "str    q21,  [%[c_ptr], #256]\n"
-                "str    q29,  [%[c_ptr], #272]\n"
-                "str    q14,  [%[c_ptr], #288]\n"
-                "str    q22,  [%[c_ptr], #304]\n"
-                "str    q30,  [%[c_ptr], #320]\n"
-                "str    q15,  [%[c_ptr], #336]\n"
-                "str    q23,  [%[c_ptr], #352]\n"
-                "str    q31,  [%[c_ptr], #368]\n"
-                "add    %[c_ptr], %[c_ptr], #384\n"
-                :
-                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
-                [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
-                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
-                : [oddk] "r"(oddk)
-                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
-                "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+                "str	q8,  [%[c_ptr]]\n"
+                "str	q16,  [%[c_ptr], #16]\n"
+                "str	q24,  [%[c_ptr], #32]\n"
+                "str	q9,  [%[c_ptr], #48]\n"
+                "str	q17,  [%[c_ptr], #64]\n"
+                "str	q25,  [%[c_ptr], #80]\n"
+                "str	q10,  [%[c_ptr], #96]\n"
+                "str	q18,  [%[c_ptr], #112]\n"
+                "str	q26,  [%[c_ptr], #128]\n"
+                "str	q11,  [%[c_ptr], #144]\n"
+                "str	q19,  [%[c_ptr], #160]\n"
+                "str	q27,  [%[c_ptr], #176]\n"
+                "str	q12,  [%[c_ptr], #192]\n"
+                "str	q20,  [%[c_ptr], #208]\n"
+                "str	q28,  [%[c_ptr], #224]\n"
+                "str	q13,  [%[c_ptr], #240]\n"
+                "str	q21,  [%[c_ptr], #256]\n"
+                "str	q29,  [%[c_ptr], #272]\n"
+                "str	q14,  [%[c_ptr], #288]\n"
+                "str	q22,  [%[c_ptr], #304]\n"
+                "str	q30,  [%[c_ptr], #320]\n"
+                "str	q15,  [%[c_ptr], #336]\n"
+                "str	q23,  [%[c_ptr], #352]\n"
+                "str	q31,  [%[c_ptr], #368]\n"
+                "add	%[c_ptr], %[c_ptr], #384\n"
+            :
+              [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+              [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
+              [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
+            : [oddk] "r" (oddk)
+            : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+              "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
+            );
         }
     }
 }
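(Reading aid only, not part of the patch: the arithmetic performed by the fmla block above, stripped of the A53-specific load scheduling, can be written with NEON intrinsics as below. The 24 accumulators v8..v31 are modelled as an 8 x 3 array of float32x4_t; panel layouts follow the A_interleave = 8 / B_interleave = 12 declarations shown earlier. The function name is illustrative.)

// Scalar-broadcast reformulation of the 12x8 kernel's accumulation.
#include <arm_neon.h>

static void sgemm_12x8_reference(const float *Apanel, const float *Bpanel,
                                 float *Cpanel, int K) {
    float32x4_t acc[8][3];
    for (int r = 0; r < 8; r++)
        for (int c = 0; c < 3; c++)
            acc[r][c] = vdupq_n_f32(0.0f);               // "movi vN.4s, #0x0"

    for (int k = 0; k < K; k++) {
        float32x4_t b0 = vld1q_f32(Bpanel + 12 * k);     // %[b0]
        float32x4_t b1 = vld1q_f32(Bpanel + 12 * k + 4); // %[b1]
        float32x4_t b2 = vld1q_f32(Bpanel + 12 * k + 8); // %[b2]
        for (int r = 0; r < 8; r++) {
            float a = Apanel[8 * k + r];                 // %[a0].s[r] / %[a1].s[r-4]
            acc[r][0] = vfmaq_n_f32(acc[r][0], b0, a);   // "fmla v8..v15"
            acc[r][1] = vfmaq_n_f32(acc[r][1], b1, a);   // "fmla v16..v23"
            acc[r][2] = vfmaq_n_f32(acc[r][2], b2, a);   // "fmla v24..v31"
        }
    }

    for (int r = 0; r < 8; r++)                          // "str qN, [%[c_ptr], #...]"
        for (int c = 0; c < 3; c++)
            vst1q_f32(Cpanel + 12 * r + 4 * c, acc[r][c]);
}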
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp
index 4ca25eb..d9aaee1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp
@@ -27,326 +27,348 @@
 
 #include "../../asmlib.hpp"
 
-namespace arm_gemm
-{
-void a64_sgemm_asimd_12x8_a55(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K)
-{
+namespace arm_gemm {
+
+void a64_sgemm_asimd_12x8_a55(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
     const float *a_ptr = Apanel;
-    float       *c_ptr = Cpanel;
+    float *c_ptr = Cpanel;
 
-    for(int yb = 0; yb < ablocks; yb++)
-    {
+    for (int yb=0; yb<ablocks; yb++) {
         const float *a_ptr0 = a_ptr;
-        const float *b_ptr  = Bpanel;
+        const float *b_ptr = Bpanel;
 
-        for(int xb = 0; xb < bblocks; xb++)
-        {
+        for (int xb=0; xb<bblocks; xb++) {
             a_ptr = a_ptr0;
             // Fix up for odd lengths - set a flag if K is odd, but make
             // sure we round up the iteration count.
             int oddk = (K & 1);
-            int k    = ((K + 1) / 2) - 1;
+            int k = ((K+1)/2) - 1;
 
-            register float32x4_t a0 asm("v0");
-            register float32x4_t a1 asm("v1");
-            register float32x4_t b0 asm("v2");
-            register float32x4_t b1 asm("v3");
-            register float32x4_t b2 asm("v4");
+            register float32x4_t a0  asm("v0");
+            register float32x4_t a1  asm("v1");
+            register float32x4_t b0  asm("v2");
+            register float32x4_t b1  asm("v3");
+            register float32x4_t b2  asm("v4");
             register float32x4_t a0a asm("v5");
             register float32x4_t a1a asm("v6");
 
-            __asm __volatile(
+            __asm __volatile (
                 // Initialize result registers, load initial operands, prime prefetches.
-                "movi    v8.4s, #0x0\n"
-                "ldr    %q[a0], [%[a_ptr]]\n"
-                "movi    v9.4s, #0x0\n"
-                "ldr    %q[b0], [%[b_ptr]]\n"
-                "movi    v10.4s, #0x0\n"
-                "ldr    %q[a1], [%[a_ptr], #16]\n"
-                "movi    v11.4s, #0x0\n"
-                "ldr    %q[b1], [%[b_ptr], #16]\n"
-                "movi    v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi    v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi    v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi    v15.4s, #0x0\n"
-                ASM_PREFETCH("[%[a_ptr], #128]") "movi    v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi    v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") "movi    v18.4s, #0x0\n"
-                ASM_PREFETCH("[%[a_ptr], #192]") "movi    v19.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]") "movi    v20.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]") "movi    v21.4s, #0x0\n"
+                "movi	v8.4s, #0x0\n"
+                "ldr	%q[a0], [%[a_ptr]]\n"
+                "movi	v9.4s, #0x0\n"
+                "ldr	%q[b0], [%[b_ptr]]\n"
+                "movi	v10.4s, #0x0\n"
+                "ldr	%q[a1], [%[a_ptr], #16]\n"
+                "movi	v11.4s, #0x0\n"
+                "ldr	%q[b1], [%[b_ptr], #16]\n"
+                "movi	v12.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi	v13.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi	v14.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi	v15.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #128]")
+                "movi	v16.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi	v17.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi	v18.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi	v19.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi	v20.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #256]")
+                "movi	v21.4s, #0x0\n"
                 ASM_PREFETCH("[%[b_ptr], #384]")
-                "movi    v22.4s, #0x0\n"
-                "movi    v23.4s, #0x0\n"
-                "movi    v24.4s, #0x0\n"
-                "movi    v25.4s, #0x0\n"
-                "movi    v26.4s, #0x0\n"
-                "movi    v27.4s, #0x0\n"
-                "movi    v28.4s, #0x0\n"
-                "movi    v29.4s, #0x0\n"
-                "movi    v30.4s, #0x0\n"
-                "movi    v31.4s, #0x0\n"
+                "movi	v22.4s, #0x0\n"
+                "movi	v23.4s, #0x0\n"
+                "movi	v24.4s, #0x0\n"
+                "movi	v25.4s, #0x0\n"
+                "movi	v26.4s, #0x0\n"
+                "movi	v27.4s, #0x0\n"
+                "movi	v28.4s, #0x0\n"
+                "movi	v29.4s, #0x0\n"
+                "movi	v30.4s, #0x0\n"
+                "movi	v31.4s, #0x0\n"
 
                 // Skip loop if we are doing zero iterations of it.
-                "cbz    %w[k], 4f\n"
+                "cbz	%w[k], 4f\n"
 
                 "1:\n"
                 // Unroll 0
-                "ldr    %d[b2], [%[b_ptr], #32]\n"
-                "fmla    v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "ldr	%d[b2], [%[b_ptr], #32]\n"
+                "fmla	v8.4s , %[b0].4s, %[a0].s[0]\n"
 
-                "fmla    v9.4s , %[b0].4s, %[a0].s[1]\n"
-                "ldr    x20, [%[b_ptr], #40]\n"
-                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
-                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
-                "subs    %w[k], %w[k], #1\n"
+                "fmla	v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "ldr	x20, [%[b_ptr], #40]\n"
+                "fmla	v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "fmla	v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "subs	%w[k], %w[k], #1\n"
 
-                "ldr    %d[a0a], [%[a_ptr], #32]\n"
-                "ins    %[b2].d[1], x20\n"
 
-                "fmla    v12.4s, %[b0].4s, %[a1].s[0]\n"
-                "fmla    v13.4s, %[b0].4s, %[a1].s[1]\n"
-                "ldr    x20, [%[a_ptr], #40]\n"
-                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
-                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "ldr	%d[a0a], [%[a_ptr], #32]\n"
+                "ins	%[b2].d[1], x20\n"
 
-                "ldr    %d[a1a], [%[a_ptr], #48]\n"
-                "ins    %[a0a].d[1], x20\n"
+                "fmla	v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla	v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "ldr	x20, [%[a_ptr], #40]\n"
+                "fmla	v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "fmla	v15.4s, %[b0].4s, %[a1].s[3]\n"
 
-                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
-                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n"
-                "ldr    x20, [%[a_ptr], #56]\n"
-                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
-                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "ldr	%d[a1a], [%[a_ptr], #48]\n"
+                "ins	%[a0a].d[1], x20\n"
 
-                "ldr    %d[b0], [%[b_ptr], #48]\n"
-                "ins    %[a1a].d[1], x20\n" ASM_PREFETCH("[%[a_ptr], #320]")
-                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
-                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
-                "ldr    x20, [%[b_ptr], #56]\n"
-                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
-                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "fmla	v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "fmla	v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "ldr	x20, [%[a_ptr], #56]\n"
+                "fmla	v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0].s[3]\n"
 
-                "ldr    %d[b1], [%[b_ptr], #64]\n"
-                "ins    %[b0].d[1], x20\n"
+                "ldr	%d[b0], [%[b_ptr], #48]\n"
+                "ins	%[a1a].d[1], x20\n"
+                ASM_PREFETCH("[%[a_ptr], #320]")
+                "fmla	v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "ldr	x20, [%[b_ptr], #56]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1].s[3]\n"
 
-                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
-                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n"
-                "ldr    x20, [%[b_ptr], #72]\n"
-                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
-                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n" ASM_PREFETCH("[%[b_ptr], #448]")
+                "ldr	%d[b1], [%[b_ptr], #64]\n"
+                "ins	%[b0].d[1], x20\n"
 
-                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
-                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n" ASM_PREFETCH("[%[b_ptr], #512]")
-                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n"
-                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+                "fmla	v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0].s[1]\n"
+                "ldr	x20, [%[b_ptr], #72]\n"
+                "fmla	v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0].s[3]\n"
+                ASM_PREFETCH("[%[b_ptr], #448]")
+
+
+                "fmla	v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "fmla	v29.4s, %[b2].4s, %[a1].s[1]\n"
+                ASM_PREFETCH("[%[b_ptr], #512]")
+                "fmla	v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "fmla	v31.4s, %[b2].4s, %[a1].s[3]\n"
 
                 // Unroll 1
-                "ldr    %d[b2], [%[b_ptr], #80]\n"
-                "ins    %[b1].d[1], x20\n"
+                "ldr	%d[b2], [%[b_ptr], #80]\n"
+                "ins	%[b1].d[1], x20\n"
 
-                "fmla    v8.4s , %[b0].4s, %[a0a].s[0]\n"
-                "fmla    v9.4s , %[b0].4s, %[a0a].s[1]\n"
-                "ldr    x20, [%[b_ptr], #88]\n"
-                "fmla    v10.4s, %[b0].4s, %[a0a].s[2]\n"
-                "fmla    v11.4s, %[b0].4s, %[a0a].s[3]\n"
+                "fmla	v8.4s , %[b0].4s, %[a0a].s[0]\n"
+                "fmla	v9.4s , %[b0].4s, %[a0a].s[1]\n"
+                "ldr	x20, [%[b_ptr], #88]\n"
+                "fmla	v10.4s, %[b0].4s, %[a0a].s[2]\n"
+                "fmla	v11.4s, %[b0].4s, %[a0a].s[3]\n"
 
-                "ldr    %d[a0], [%[a_ptr], #64]\n"
-                "ins    %[b2].d[1], x20\n"
+                "ldr	%d[a0], [%[a_ptr], #64]\n"
+                "ins	%[b2].d[1], x20\n"
 
-                "fmla    v12.4s, %[b0].4s, %[a1a].s[0]\n"
-                "fmla    v13.4s, %[b0].4s, %[a1a].s[1]\n"
-                "ldr    x20, [%[a_ptr], #72]\n"
-                "fmla    v14.4s, %[b0].4s, %[a1a].s[2]\n"
-                "fmla    v15.4s, %[b0].4s, %[a1a].s[3]\n"
+                "fmla	v12.4s, %[b0].4s, %[a1a].s[0]\n"
+                "fmla	v13.4s, %[b0].4s, %[a1a].s[1]\n"
+                "ldr	x20, [%[a_ptr], #72]\n"
+                "fmla	v14.4s, %[b0].4s, %[a1a].s[2]\n"
+                "fmla	v15.4s, %[b0].4s, %[a1a].s[3]\n"
 
-                "ldr    %d[a1], [%[a_ptr], #80]\n"
-                "ins    %[a0].d[1], x20\n"
+                "ldr	%d[a1], [%[a_ptr], #80]\n"
+                "ins	%[a0].d[1], x20\n"
 
-                "fmla    v16.4s, %[b1].4s, %[a0a].s[0]\n"
-                "fmla    v17.4s, %[b1].4s, %[a0a].s[1]\n"
-                "ldr    x20, [%[a_ptr], #88]\n"
-                "fmla    v18.4s, %[b1].4s, %[a0a].s[2]\n"
-                "fmla    v19.4s, %[b1].4s, %[a0a].s[3]\n"
+                "fmla	v16.4s, %[b1].4s, %[a0a].s[0]\n"
+                "fmla	v17.4s, %[b1].4s, %[a0a].s[1]\n"
+                "ldr	x20, [%[a_ptr], #88]\n"
+                "fmla	v18.4s, %[b1].4s, %[a0a].s[2]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0a].s[3]\n"
 
-                "ldr    %d[b0], [%[b_ptr], #96]\n"
-                "ins    %[a1].d[1], x20\n"
 
-                "fmla    v20.4s, %[b1].4s, %[a1a].s[0]\n"
-                "fmla    v21.4s, %[b1].4s, %[a1a].s[1]\n"
-                "ldr    x20, [%[b_ptr], #104]\n"
-                "fmla    v22.4s, %[b1].4s, %[a1a].s[2]\n"
-                "fmla    v23.4s, %[b1].4s, %[a1a].s[3]\n"
+                "ldr	%d[b0], [%[b_ptr], #96]\n"
+                "ins	%[a1].d[1], x20\n"
 
-                "ldr    %d[b1], [%[b_ptr], #112]\n"
-                "ins    %[b0].d[1], x20\n"
+                "fmla	v20.4s, %[b1].4s, %[a1a].s[0]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1a].s[1]\n"
+                "ldr	x20, [%[b_ptr], #104]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1a].s[2]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1a].s[3]\n"
 
-                "fmla    v24.4s, %[b2].4s, %[a0a].s[0]\n"
-                "fmla    v25.4s, %[b2].4s, %[a0a].s[1]\n"
-                "ldr    x20, [%[b_ptr], #120]\n"
-                "fmla    v26.4s, %[b2].4s, %[a0a].s[2]\n"
-                "fmla    v27.4s, %[b2].4s, %[a0a].s[3]\n"
-                "add    %[a_ptr], %[a_ptr], #64\n"
+                "ldr	%d[b1], [%[b_ptr], #112]\n"
+                "ins	%[b0].d[1], x20\n"
 
-                "fmla    v28.4s, %[b2].4s, %[a1a].s[0]\n"
-                "fmla    v29.4s, %[b2].4s, %[a1a].s[1]\n"
-                "fmla    v30.4s, %[b2].4s, %[a1a].s[2]\n"
-                "add    %[b_ptr], %[b_ptr], #96\n"
-                "fmla    v31.4s, %[b2].4s, %[a1a].s[3]\n"
+                "fmla	v24.4s, %[b2].4s, %[a0a].s[0]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0a].s[1]\n"
+                "ldr	x20, [%[b_ptr], #120]\n"
+                "fmla	v26.4s, %[b2].4s, %[a0a].s[2]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0a].s[3]\n"
+                "add	%[a_ptr], %[a_ptr], #64\n"
 
-                "ldr    %d[b2], [%[b_ptr], #32]\n"
-                "ins    %[b1].d[1], x20\n"
+                "fmla	v28.4s, %[b2].4s, %[a1a].s[0]\n"
+                "fmla	v29.4s, %[b2].4s, %[a1a].s[1]\n"
+                "fmla	v30.4s, %[b2].4s, %[a1a].s[2]\n"
+                "add	%[b_ptr], %[b_ptr], #96\n"
+                "fmla	v31.4s, %[b2].4s, %[a1a].s[3]\n"
 
-                "bne    1b\n"
+
+                "ldr	%d[b2], [%[b_ptr], #32]\n"
+                "ins	%[b1].d[1], x20\n"
+
+
+                "bne	1b\n"
 
                 // Branch here if K=1 or 2.  Do the right thing for odd/even at the end.
                 "4:\n"
-                "cbnz    %w[oddk], 2f\n"
-                "fmla    v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "cbnz	%w[oddk], 2f\n"
+                "fmla	v8.4s , %[b0].4s, %[a0].s[0]\n"
 
                 // Detached final iteration. (even K)
-                "ldr    x20, [%[b_ptr], #40]\n"
-                "fmla    v9.4s , %[b0].4s, %[a0].s[1]\n"
-                "subs    %w[k], %w[k], #1\n"
-                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
-                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "ldr	x20, [%[b_ptr], #40]\n"
+                "fmla	v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "subs	%w[k], %w[k], #1\n"
+                "fmla	v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "fmla	v11.4s, %[b0].4s, %[a0].s[3]\n"
 
-                "ldr    %d[a0a], [%[a_ptr], #32]\n"
-                "ins    %[b2].d[1], x20\n"
+                "ldr	%d[a0a], [%[a_ptr], #32]\n"
+                "ins	%[b2].d[1], x20\n"
 
-                "fmla    v12.4s, %[b0].4s, %[a1].s[0]\n"
-                "fmla    v13.4s, %[b0].4s, %[a1].s[1]\n"
-                "ldr    x20, [%[a_ptr], #40]\n"
-                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
-                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "fmla	v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla	v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "ldr	x20, [%[a_ptr], #40]\n"
+                "fmla	v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "fmla	v15.4s, %[b0].4s, %[a1].s[3]\n"
 
-                "ldr    %d[a1a], [%[a_ptr], #48]\n"
-                "ins    %[a0a].d[1], x20\n"
+                "ldr	%d[a1a], [%[a_ptr], #48]\n"
+                "ins	%[a0a].d[1], x20\n"
 
-                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
-                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n"
-                "ldr    x20, [%[a_ptr], #56]\n"
-                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
-                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "fmla	v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "fmla	v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "ldr	x20, [%[a_ptr], #56]\n"
+                "fmla	v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0].s[3]\n"
 
-                "ldr    %d[b0], [%[b_ptr], #48]\n"
-                "ins    %[a1a].d[1], x20\n"
+                "ldr	%d[b0], [%[b_ptr], #48]\n"
+                "ins	%[a1a].d[1], x20\n"
 
-                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
-                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
-                "ldr    x20, [%[b_ptr], #56]\n"
-                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
-                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "fmla	v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "ldr	x20, [%[b_ptr], #56]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1].s[3]\n"
 
-                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
-                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n"
-                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
-                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "fmla	v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0].s[1]\n"
+                "fmla	v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0].s[3]\n"
 
-                "ldr    %d[b1], [%[b_ptr], #64]\n"
-                "ins    %[b0].d[1], x20\n"
+                "ldr	%d[b1], [%[b_ptr], #64]\n"
+                "ins	%[b0].d[1], x20\n"
 
-                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
-                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n"
-                "ldr    x20, [%[b_ptr], #72]\n"
-                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n"
-                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+                "fmla	v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "fmla	v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "ldr	x20, [%[b_ptr], #72]\n"
+                "fmla	v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "fmla	v31.4s, %[b2].4s, %[a1].s[3]\n"
 
-                "ldr    %d[b2], [%[b_ptr], #80]\n"
-                "ins    %[b1].d[1], x20\n"
+                "ldr	%d[b2], [%[b_ptr], #80]\n"
+                "ins	%[b1].d[1], x20\n"
 
-                "fmla    v8.4s , %[b0].4s, %[a0a].s[0]\n"
-                "fmla    v9.4s , %[b0].4s, %[a0a].s[1]\n"
-                "ldr    x20, [%[b_ptr], #88]\n"
-                "fmla    v10.4s, %[b0].4s, %[a0a].s[2]\n"
+                "fmla	v8.4s , %[b0].4s, %[a0a].s[0]\n"
+                "fmla	v9.4s , %[b0].4s, %[a0a].s[1]\n"
+                "ldr	x20, [%[b_ptr], #88]\n"
+                "fmla	v10.4s, %[b0].4s, %[a0a].s[2]\n"
 
-                "ins    %[b2].d[1], x20\n"
-                "fmla    v11.4s, %[b0].4s, %[a0a].s[3]\n"
-                "fmla    v12.4s, %[b0].4s, %[a1a].s[0]\n"
-                "fmla    v13.4s, %[b0].4s, %[a1a].s[1]\n"
-                "fmla    v14.4s, %[b0].4s, %[a1a].s[2]\n"
-                "fmla    v15.4s, %[b0].4s, %[a1a].s[3]\n"
-                "fmla    v16.4s, %[b1].4s, %[a0a].s[0]\n"
-                "fmla    v17.4s, %[b1].4s, %[a0a].s[1]\n"
-                "fmla    v18.4s, %[b1].4s, %[a0a].s[2]\n"
-                "fmla    v19.4s, %[b1].4s, %[a0a].s[3]\n"
-                "fmla    v20.4s, %[b1].4s, %[a1a].s[0]\n"
-                "fmla    v21.4s, %[b1].4s, %[a1a].s[1]\n"
-                "fmla    v22.4s, %[b1].4s, %[a1a].s[2]\n"
-                "fmla    v23.4s, %[b1].4s, %[a1a].s[3]\n"
-                "fmla    v24.4s, %[b2].4s, %[a0a].s[0]\n"
-                "fmla    v25.4s, %[b2].4s, %[a0a].s[1]\n"
-                "fmla    v26.4s, %[b2].4s, %[a0a].s[2]\n"
-                "fmla    v27.4s, %[b2].4s, %[a0a].s[3]\n"
-                "fmla    v28.4s, %[b2].4s, %[a1a].s[0]\n"
-                "fmla    v29.4s, %[b2].4s, %[a1a].s[1]\n"
-                "add    %[a_ptr], %[a_ptr], #64\n"
-                "fmla    v30.4s, %[b2].4s, %[a1a].s[2]\n"
-                "add    %[b_ptr], %[b_ptr], #96\n"
-                "fmla    v31.4s, %[b2].4s, %[a1a].s[3]\n"
-                "b    3f\n"
+                "ins	%[b2].d[1], x20\n"
+                "fmla	v11.4s, %[b0].4s, %[a0a].s[3]\n"
+                "fmla	v12.4s, %[b0].4s, %[a1a].s[0]\n"
+                "fmla	v13.4s, %[b0].4s, %[a1a].s[1]\n"
+                "fmla	v14.4s, %[b0].4s, %[a1a].s[2]\n"
+                "fmla	v15.4s, %[b0].4s, %[a1a].s[3]\n"
+                "fmla	v16.4s, %[b1].4s, %[a0a].s[0]\n"
+                "fmla	v17.4s, %[b1].4s, %[a0a].s[1]\n"
+                "fmla	v18.4s, %[b1].4s, %[a0a].s[2]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0a].s[3]\n"
+                "fmla	v20.4s, %[b1].4s, %[a1a].s[0]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1a].s[1]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1a].s[2]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1a].s[3]\n"
+                "fmla	v24.4s, %[b2].4s, %[a0a].s[0]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0a].s[1]\n"
+                "fmla	v26.4s, %[b2].4s, %[a0a].s[2]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0a].s[3]\n"
+                "fmla	v28.4s, %[b2].4s, %[a1a].s[0]\n"
+                "fmla	v29.4s, %[b2].4s, %[a1a].s[1]\n"
+                "add	%[a_ptr], %[a_ptr], #64\n"
+                "fmla	v30.4s, %[b2].4s, %[a1a].s[2]\n"
+                "add	%[b_ptr], %[b_ptr], #96\n"
+                "fmla	v31.4s, %[b2].4s, %[a1a].s[3]\n"
+                "b	3f\n"
 
                 // Detached final iteration. (odd K)
                 "2:\n"
 
-                "ldr    %d[b2], [%[b_ptr], #32]\n"
-                "fmla    v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "ldr	%d[b2], [%[b_ptr], #32]\n"
+                "fmla	v8.4s , %[b0].4s, %[a0].s[0]\n"
 
-                "fmla    v9.4s , %[b0].4s, %[a0].s[1]\n"
-                "ldr    x20, [%[b_ptr], #40]\n"
-                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
-                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
-                "ins    %[b2].d[1], x20\n"
-                "fmla    v12.4s, %[b0].4s, %[a1].s[0]\n"
-                "fmla    v13.4s, %[b0].4s, %[a1].s[1]\n"
-                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
-                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
-                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
-                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n"
-                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
-                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
-                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
-                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
-                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
-                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
-                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
-                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n"
-                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
-                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n"
-                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
-                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n"
-                "add    %[a_ptr], %[a_ptr], #32\n"
-                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n"
-                "add    %[b_ptr], %[b_ptr], #48\n"
-                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+                "fmla	v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "ldr	x20, [%[b_ptr], #40]\n"
+                "fmla	v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "fmla	v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "ins	%[b2].d[1], x20\n"
+                "fmla	v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla	v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "fmla	v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "fmla	v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "fmla	v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "fmla	v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "fmla	v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "fmla	v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "fmla	v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0].s[1]\n"
+                "fmla	v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "fmla	v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "fmla	v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "add	%[a_ptr], %[a_ptr], #32\n"
+                "fmla	v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "add	%[b_ptr], %[b_ptr], #48\n"
+                "fmla	v31.4s, %[b2].4s, %[a1].s[3]\n"
 
                 // Common tail
                 "3:\n"
-                "str    q8,  [%[c_ptr]]\n"
-                "str    q16,  [%[c_ptr], #16]\n"
-                "str    q24,  [%[c_ptr], #32]\n"
-                "str    q9,  [%[c_ptr], #48]\n"
-                "str    q17,  [%[c_ptr], #64]\n"
-                "str    q25,  [%[c_ptr], #80]\n"
-                "str    q10,  [%[c_ptr], #96]\n"
-                "str    q18,  [%[c_ptr], #112]\n"
-                "str    q26,  [%[c_ptr], #128]\n"
-                "str    q11,  [%[c_ptr], #144]\n"
-                "str    q19,  [%[c_ptr], #160]\n"
-                "str    q27,  [%[c_ptr], #176]\n"
-                "str    q12,  [%[c_ptr], #192]\n"
-                "str    q20,  [%[c_ptr], #208]\n"
-                "str    q28,  [%[c_ptr], #224]\n"
-                "str    q13,  [%[c_ptr], #240]\n"
-                "str    q21,  [%[c_ptr], #256]\n"
-                "str    q29,  [%[c_ptr], #272]\n"
-                "str    q14,  [%[c_ptr], #288]\n"
-                "str    q22,  [%[c_ptr], #304]\n"
-                "str    q30,  [%[c_ptr], #320]\n"
-                "str    q15,  [%[c_ptr], #336]\n"
-                "str    q23,  [%[c_ptr], #352]\n"
-                "str    q31,  [%[c_ptr], #368]\n"
-                "add    %[c_ptr], %[c_ptr], #384\n"
-                :
-                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
-                [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
-                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
-                : [oddk] "r"(oddk)
-                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
-                "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+                "str	q8,  [%[c_ptr]]\n"
+                "str	q16,  [%[c_ptr], #16]\n"
+                "str	q24,  [%[c_ptr], #32]\n"
+                "str	q9,  [%[c_ptr], #48]\n"
+                "str	q17,  [%[c_ptr], #64]\n"
+                "str	q25,  [%[c_ptr], #80]\n"
+                "str	q10,  [%[c_ptr], #96]\n"
+                "str	q18,  [%[c_ptr], #112]\n"
+                "str	q26,  [%[c_ptr], #128]\n"
+                "str	q11,  [%[c_ptr], #144]\n"
+                "str	q19,  [%[c_ptr], #160]\n"
+                "str	q27,  [%[c_ptr], #176]\n"
+                "str	q12,  [%[c_ptr], #192]\n"
+                "str	q20,  [%[c_ptr], #208]\n"
+                "str	q28,  [%[c_ptr], #224]\n"
+                "str	q13,  [%[c_ptr], #240]\n"
+                "str	q21,  [%[c_ptr], #256]\n"
+                "str	q29,  [%[c_ptr], #272]\n"
+                "str	q14,  [%[c_ptr], #288]\n"
+                "str	q22,  [%[c_ptr], #304]\n"
+                "str	q30,  [%[c_ptr], #320]\n"
+                "str	q15,  [%[c_ptr], #336]\n"
+                "str	q23,  [%[c_ptr], #352]\n"
+                "str	q31,  [%[c_ptr], #368]\n"
+                "add	%[c_ptr], %[c_ptr], #384\n"
+            :
+              [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+              [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
+              [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
+            : [oddk] "r" (oddk)
+            : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+              "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
+            );
         }
     }
 }
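The kernel above, like the A55r1 and generic variants that follow, drives the K dimension the same way: the main loop at label "1:" consumes two K steps per iteration, and the final iteration is detached so the even-K path (falling through at "4:") and the odd-K path ("2:") can each be finished correctly before the common tail at "3:" stores the accumulators. That control flow is easier to see without the interleaved loads and prefetches, so the sketch below renders just that structure in plain C++, reusing the oddk/k_iters expressions from the source. process_two_k_steps() and process_one_k_step() are hypothetical stand-ins for the blocks of fmla instructions; they are not functions that exist in arm_gemm.

// Minimal sketch (not the kernel itself) of the unrolled K loop and its
// detached final iteration, using the same fix-up as the assembly above.
#include <cstdio>

static void process_two_k_steps(int k_index) { std::printf("unroll 0+1 at k=%d\n", k_index); }
static void process_one_k_step(int k_index)  { std::printf("single step at k=%d\n", k_index); }

static void run_k_loop(int K) {
    // Flag odd K, round the iteration count up, and keep one double step
    // back for the detached final iteration.
    int oddk    = (K & 1);
    int k_iters = ((K + 1) / 2) - 1;

    int k = 0;
    for (int i = 0; i < k_iters; i++, k += 2) {
        process_two_k_steps(k);     // loop body at "1:" (unroll 0 and unroll 1)
    }
    if (oddk) {
        process_one_k_step(k);      // "2:" detached final iteration (odd K)
    } else {
        process_two_k_steps(k);     // detached final iteration (even K)
    }
    // "3:" common tail would store the accumulators here.
}

int main() {
    run_k_loop(5);   // odd K: two double steps plus one single step
    run_k_loop(6);   // even K: two double steps plus one detached double step
    return 0;
}

The same structure explains the "cbz %w[k], 4f" at the top of each kernel: when K is 1 or 2, k_iters is already zero, the main loop is skipped entirely, and only the detached final iteration runs.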
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp
index 89fe6ac..114c807 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp
@@ -27,37 +27,34 @@
 
 #include "../../asmlib.hpp"
 
-namespace arm_gemm
-{
-void a64_sgemm_asimd_12x8_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, const int ablocks, const int bblocks, const int K)
-{
+namespace arm_gemm {
+
+void a64_sgemm_asimd_12x8_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, const int ablocks, const int bblocks, const int K) {
     const float *a_ptr = Apanel;
-    float       *c_ptr = Cpanel;
+    float *c_ptr = Cpanel;
 
     // Fix up for odd lengths - set a flag if K is odd, but make
     // sure we round up the iteration count.
-    int oddk    = (K & 1);
-    int k_iters = ((K + 1) / 2) - 1;
+    int oddk = (K & 1);
+    int k_iters = ((K+1)/2) - 1;
 
-    for(int yb = 0; yb < ablocks; yb++)
-    {
+    for (int yb=0; yb<ablocks; yb++) {
         const float *a_ptr0 = a_ptr;
-        const float *b_ptr  = Bpanel;
+        const float *b_ptr = Bpanel;
 
-        for(int xb = 0; xb < bblocks; xb++)
-        {
+        for (int xb=0; xb<bblocks; xb++) {
             a_ptr = a_ptr0;
             int k = k_iters;
 
-            register float32x4_t a0 asm("v0");
-            register float32x4_t a1 asm("v1");
-            register float32x4_t b0 asm("v2");
-            register float32x4_t b1 asm("v3");
-            register float32x4_t b2 asm("v4");
+            register float32x4_t a0  asm("v0");
+            register float32x4_t a1  asm("v1");
+            register float32x4_t b0  asm("v2");
+            register float32x4_t b1  asm("v3");
+            register float32x4_t b2  asm("v4");
             register float32x4_t a0a asm("v5");
             register float32x4_t a1a asm("v6");
 
-            __asm __volatile(
+            __asm __volatile (
                 // Initialize result registers, load initial operands, prime prefetches.
                 "movi   v8.4s, #0x0\n"
                 "ldr    %q[a0], [%[a_ptr]]\n"
@@ -67,272 +64,319 @@
                 "ldr    %q[a1], [%[a_ptr], #16]\n"
                 "movi   v11.4s, #0x0\n"
                 "ldr    %q[b1], [%[b_ptr], #16]\n"
-                "movi   v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi   v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi   v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi   v15.4s, #0x0\n"
-                ASM_PREFETCH("[%[a_ptr], #128]") "movi   v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi   v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi   v12.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi   v13.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi   v14.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi   v15.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #128]")
+                "movi   v16.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi   v17.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #256]")
                 "movi   v18.4s, #0x0\n"
-                "movi   v19.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi   v19.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #192]")
                 "movi   v20.4s, #0x0\n"
-                "movi   v21.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi   v21.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #320]")
                 "movi   v22.4s, #0x0\n"
-                "movi   v23.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]")
+                "movi   v23.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #256]")
                 "movi   v24.4s, #0x0\n"
-                "movi   v25.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #384]")
+                "movi   v25.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #384]")
                 "movi   v26.4s, #0x0\n"
-                "movi   v27.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #448]")
+                "movi   v27.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #448]")
                 "movi   v28.4s, #0x0\n"
-                "movi   v29.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #384]")
+                "movi   v29.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #384]")
                 "movi   v30.4s, #0x0\n"
-                "movi   v31.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #512]")
+                "movi   v31.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #512]")
 
                 // The loop is offset by these two instructions which must
                 // always be executed.
-                "fmla    v8.4s , %[b0].4s, %[a0].s[0]\n"
-                "ldr    %d[b2], [%[b_ptr], #32]\n"
+                "fmla	v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "ldr	%d[b2], [%[b_ptr], #32]\n"
 
                 // Skip loop if we are doing zero iterations of it.
-                "cbz    %w[k], 4f\n"
+                "cbz	%w[k], 4f\n"
 
                 "1:\n"
                 // Unroll 0
-                "fmla    v9.4s , %[b0].4s, %[a0].s[1]\n"
-                "ldr    x20, [%[b_ptr], #40]\n"
-                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
-                "subs    %w[k], %w[k], #1\n"
-                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
-                "ldr    %d[a0a], [%[a_ptr], #32]\n"
+                "fmla	v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "ldr	x20, [%[b_ptr], #40]\n"
+                "fmla	v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "subs	%w[k], %w[k], #1\n"
+                "fmla	v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "ldr	%d[a0a], [%[a_ptr], #32]\n"
 
-                "fmla    v12.4s, %[b0].4s, %[a1].s[0]\n"
-                "ins    %[b2].d[1], x20\n"
-                "fmla    v13.4s, %[b0].4s, %[a1].s[1]\n"
-                "ldr    x20, [%[a_ptr], #40]\n"
-                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
-                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
-                "ldr    %d[a1a], [%[a_ptr], #48]\n"
+                "fmla	v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "ins	%[b2].d[1], x20\n"
+                "fmla	v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "ldr	x20, [%[a_ptr], #40]\n"
+                "fmla	v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "fmla	v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "ldr	%d[a1a], [%[a_ptr], #48]\n"
 
-                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
-                "ins    %[a0a].d[1], x20\n"
-                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n"
-                "ldr    x20, [%[a_ptr], #56]\n"
-                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
-                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
-                "ldr    %d[b0], [%[b_ptr], #48]\n"
+                "fmla	v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "ins	%[a0a].d[1], x20\n"
+                "fmla	v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "ldr	x20, [%[a_ptr], #56]\n"
+                "fmla	v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "ldr	%d[b0], [%[b_ptr], #48]\n"
 
-                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
-                "ins    %[a1a].d[1], x20\n"
-                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
-                "ldr    x20, [%[b_ptr], #56]\n"
-                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
-                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
-                "ldr    %d[b1], [%[b_ptr], #64]\n"
+                "fmla	v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "ins	%[a1a].d[1], x20\n"
+                "fmla	v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "ldr	x20, [%[b_ptr], #56]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "ldr	%d[b1], [%[b_ptr], #64]\n"
 
-                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
-                "ins    %[b0].d[1], x20\n"
-                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n"
-                "ldr    x20, [%[b_ptr], #72]\n"
-                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
-                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n" ASM_PREFETCH("[%[a_ptr], #448]")
+                "fmla	v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "ins	%[b0].d[1], x20\n"
+                "fmla	v25.4s, %[b2].4s, %[a0].s[1]\n"
+                "ldr	x20, [%[b_ptr], #72]\n"
+                "fmla	v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0].s[3]\n"
+                ASM_PREFETCH("[%[a_ptr], #448]")
 
-                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
-                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n" ASM_PREFETCH("[%[b_ptr], #576]")
-                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n"
-                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+                "fmla	v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "fmla	v29.4s, %[b2].4s, %[a1].s[1]\n"
+                ASM_PREFETCH("[%[b_ptr], #576]")
+                "fmla	v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "fmla	v31.4s, %[b2].4s, %[a1].s[3]\n"
 
                 // Unroll 1
-                "ldr    %d[b2], [%[b_ptr], #80]\n"
+                "ldr	%d[b2], [%[b_ptr], #80]\n"
 
-                "fmla    v8.4s , %[b0].4s, %[a0a].s[0]\n"
-                "ins    %[b1].d[1], x20\n"
-                "fmla    v9.4s , %[b0].4s, %[a0a].s[1]\n"
-                "ldr    x20, [%[b_ptr], #88]\n"
-                "fmla    v10.4s, %[b0].4s, %[a0a].s[2]\n"
-                "fmla    v11.4s, %[b0].4s, %[a0a].s[3]\n"
-                "ldr    %d[a0], [%[a_ptr], #64]\n"
+                "fmla	v8.4s , %[b0].4s, %[a0a].s[0]\n"
+                "ins	%[b1].d[1], x20\n"
+                "fmla	v9.4s , %[b0].4s, %[a0a].s[1]\n"
+                "ldr	x20, [%[b_ptr], #88]\n"
+                "fmla	v10.4s, %[b0].4s, %[a0a].s[2]\n"
+                "fmla	v11.4s, %[b0].4s, %[a0a].s[3]\n"
+                "ldr	%d[a0], [%[a_ptr], #64]\n"
 
-                "fmla    v12.4s, %[b0].4s, %[a1a].s[0]\n"
-                "ins    %[b2].d[1], x20\n"
-                "fmla    v13.4s, %[b0].4s, %[a1a].s[1]\n"
-                "ldr    x20, [%[a_ptr], #72]\n"
-                "fmla    v14.4s, %[b0].4s, %[a1a].s[2]\n"
-                "fmla    v15.4s, %[b0].4s, %[a1a].s[3]\n"
-                "ldr    %d[a1], [%[a_ptr], #80]\n"
+                "fmla	v12.4s, %[b0].4s, %[a1a].s[0]\n"
+                "ins	%[b2].d[1], x20\n"
+                "fmla	v13.4s, %[b0].4s, %[a1a].s[1]\n"
+                "ldr	x20, [%[a_ptr], #72]\n"
+                "fmla	v14.4s, %[b0].4s, %[a1a].s[2]\n"
+                "fmla	v15.4s, %[b0].4s, %[a1a].s[3]\n"
+                "ldr	%d[a1], [%[a_ptr], #80]\n"
 
-                "fmla    v16.4s, %[b1].4s, %[a0a].s[0]\n"
-                "ins    %[a0].d[1], x20\n"
-                "fmla    v17.4s, %[b1].4s, %[a0a].s[1]\n"
-                "ldr    x20, [%[a_ptr], #88]\n"
-                "fmla    v18.4s, %[b1].4s, %[a0a].s[2]\n"
-                "fmla    v19.4s, %[b1].4s, %[a0a].s[3]\n"
-                "ldr    %d[b0], [%[b_ptr], #96]\n"
+                "fmla	v16.4s, %[b1].4s, %[a0a].s[0]\n"
+                "ins	%[a0].d[1], x20\n"
+                "fmla	v17.4s, %[b1].4s, %[a0a].s[1]\n"
+                "ldr	x20, [%[a_ptr], #88]\n"
+                "fmla	v18.4s, %[b1].4s, %[a0a].s[2]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0a].s[3]\n"
+                "ldr	%d[b0], [%[b_ptr], #96]\n"
 
-                "fmla    v20.4s, %[b1].4s, %[a1a].s[0]\n"
-                "ins    %[a1].d[1], x20\n"
-                "fmla    v21.4s, %[b1].4s, %[a1a].s[1]\n"
-                "ldr    x20, [%[b_ptr], #104]\n"
-                "fmla    v22.4s, %[b1].4s, %[a1a].s[2]\n"
-                "fmla    v23.4s, %[b1].4s, %[a1a].s[3]\n"
-                "ldr    %d[b1], [%[b_ptr], #112]\n"
+                "fmla	v20.4s, %[b1].4s, %[a1a].s[0]\n"
+                "ins	%[a1].d[1], x20\n"
+                "fmla	v21.4s, %[b1].4s, %[a1a].s[1]\n"
+                "ldr	x20, [%[b_ptr], #104]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1a].s[2]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1a].s[3]\n"
+                "ldr	%d[b1], [%[b_ptr], #112]\n"
 
-                "fmla    v24.4s, %[b2].4s, %[a0a].s[0]\n"
-                "ins    %[b0].d[1], x20\n"
-                "fmla    v25.4s, %[b2].4s, %[a0a].s[1]\n"
-                "ldr    x20, [%[b_ptr], #120]\n"
-                "fmla    v26.4s, %[b2].4s, %[a0a].s[2]\n"
+                "fmla	v24.4s, %[b2].4s, %[a0a].s[0]\n"
+                "ins	%[b0].d[1], x20\n"
+                "fmla	v25.4s, %[b2].4s, %[a0a].s[1]\n"
+                "ldr	x20, [%[b_ptr], #120]\n"
+                "fmla	v26.4s, %[b2].4s, %[a0a].s[2]\n"
 
-                "fmla    v27.4s, %[b2].4s, %[a0a].s[3]\n"
-                "add    %[a_ptr], %[a_ptr], #64\n"
+                "fmla	v27.4s, %[b2].4s, %[a0a].s[3]\n"
+                "add	%[a_ptr], %[a_ptr], #64\n"
 
-                "fmla    v28.4s, %[b2].4s, %[a1a].s[0]\n" ASM_PREFETCH("[%[b_ptr], #640]")
-                "fmla    v29.4s, %[b2].4s, %[a1a].s[1]\n"
-                "add    %[b_ptr], %[b_ptr], #96\n"
-                "fmla    v30.4s, %[b2].4s, %[a1a].s[2]\n"
-                "ins    %[b1].d[1], x20\n"
-                "fmla    v31.4s, %[b2].4s, %[a1a].s[3]\n"
-                "ldr    %d[b2], [%[b_ptr], #32]\n"
+                "fmla	v28.4s, %[b2].4s, %[a1a].s[0]\n"
+                ASM_PREFETCH("[%[b_ptr], #640]")
+                "fmla	v29.4s, %[b2].4s, %[a1a].s[1]\n"
+                "add	%[b_ptr], %[b_ptr], #96\n"
+                "fmla	v30.4s, %[b2].4s, %[a1a].s[2]\n"
+                "ins	%[b1].d[1], x20\n"
+                "fmla	v31.4s, %[b2].4s, %[a1a].s[3]\n"
+                "ldr	%d[b2], [%[b_ptr], #32]\n"
 
-                "fmla    v8.4s , %[b0].4s, %[a0].s[0]\n"
-                "b.ne    1b\n"
+                "fmla	v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "b.ne	1b\n"
 
                 // Branch here if K=1 or 2.  Do the right thing for odd/even at the end.
                 "4:\n"
 
-                // Start final iteration - branch off to "odd" code before we load a0a.
-                "fmla    v9.4s , %[b0].4s, %[a0].s[1]\n"
-                "ldr    x20, [%[b_ptr], #40]\n"
-                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
-                "cbnz    %w[oddk], 2f\n"
+		// Start final iteration - branch off to "odd" code before we load a0a.
+                "fmla	v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "ldr	x20, [%[b_ptr], #40]\n"
+                "fmla	v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "cbnz	%w[oddk], 2f\n"
 
                 // Even K continuation
-                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
-                "ldr    %d[a0a], [%[a_ptr], #32]\n"
+                "fmla	v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "ldr	%d[a0a], [%[a_ptr], #32]\n"
 
-                "fmla    v12.4s, %[b0].4s, %[a1].s[0]\n"
-                "ins    %[b2].d[1], x20\n"
-                "fmla    v13.4s, %[b0].4s, %[a1].s[1]\n"
-                "ldr    x20, [%[a_ptr], #40]\n"
-                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n" ASM_PREFETCHW("[%[c_ptr]]")
-                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
-                "ldr    %d[a1a], [%[a_ptr], #48]\n"
+                "fmla	v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "ins	%[b2].d[1], x20\n"
+                "fmla	v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "ldr	x20, [%[a_ptr], #40]\n"
+                "fmla	v14.4s, %[b0].4s, %[a1].s[2]\n"
+                ASM_PREFETCHW("[%[c_ptr]]")
+                "fmla	v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "ldr	%d[a1a], [%[a_ptr], #48]\n"
 
-                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
-                "ins    %[a0a].d[1], x20\n"
-                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n"
-                "ldr    x20, [%[a_ptr], #56]\n"
-                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
-                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
-                "ldr    %d[b0], [%[b_ptr], #48]\n"
+                "fmla	v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "ins	%[a0a].d[1], x20\n"
+                "fmla	v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "ldr	x20, [%[a_ptr], #56]\n"
+                "fmla	v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "ldr	%d[b0], [%[b_ptr], #48]\n"
 
-                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
-                "ins    %[a1a].d[1], x20\n"
-                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
-                "ldr    x20, [%[b_ptr], #56]\n"
-                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
-                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "fmla	v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "ins	%[a1a].d[1], x20\n"
+                "fmla	v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "ldr	x20, [%[b_ptr], #56]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1].s[2]\n"
+                ASM_PREFETCHW("[%[c_ptr], #64]")
+                "fmla	v23.4s, %[b1].4s, %[a1].s[3]\n"
 
-                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
-                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
-                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
-                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n"
-                "ldr    %d[b1], [%[b_ptr], #64]\n"
+                "fmla	v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0].s[1]\n"
+                ASM_PREFETCHW("[%[c_ptr], #128]")
+                "fmla	v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "ldr	%d[b1], [%[b_ptr], #64]\n"
 
-                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
-                "ins    %[b0].d[1], x20\n"
-                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n"
-                "ldr    x20, [%[b_ptr], #72]\n"
-                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
-                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
-                "ldr    %d[b2], [%[b_ptr], #80]\n"
+                "fmla	v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "ins	%[b0].d[1], x20\n"
+                "fmla	v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "ldr	x20, [%[b_ptr], #72]\n"
+                "fmla	v30.4s, %[b2].4s, %[a1].s[2]\n"
+                ASM_PREFETCHW("[%[c_ptr], #192]")
+                "fmla	v31.4s, %[b2].4s, %[a1].s[3]\n"
+                "ldr	%d[b2], [%[b_ptr], #80]\n"
 
-                "fmla    v8.4s , %[b0].4s, %[a0a].s[0]\n"
-                "ins    %[b1].d[1], x20\n"
-                "fmla    v9.4s , %[b0].4s, %[a0a].s[1]\n"
-                "ldr    x20, [%[b_ptr], #88]\n"
-                "fmla    v10.4s, %[b0].4s, %[a0a].s[2]\n"
-                "ins    %[b2].d[1], x20\n"
+                "fmla	v8.4s , %[b0].4s, %[a0a].s[0]\n"
+                "ins	%[b1].d[1], x20\n"
+                "fmla	v9.4s , %[b0].4s, %[a0a].s[1]\n"
+                "ldr	x20, [%[b_ptr], #88]\n"
+                "fmla	v10.4s, %[b0].4s, %[a0a].s[2]\n"
+                "ins	%[b2].d[1], x20\n"
 
-                "fmla    v11.4s, %[b0].4s, %[a0a].s[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
-                "fmla    v12.4s, %[b0].4s, %[a1a].s[0]\n"
-                "fmla    v13.4s, %[b0].4s, %[a1a].s[1]\n"
-                "fmla    v14.4s, %[b0].4s, %[a1a].s[2]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
-                "fmla    v15.4s, %[b0].4s, %[a1a].s[3]\n"
-                "fmla    v16.4s, %[b1].4s, %[a0a].s[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
-                "fmla    v17.4s, %[b1].4s, %[a0a].s[1]\n"
-                "fmla    v18.4s, %[b1].4s, %[a0a].s[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
-                "fmla    v19.4s, %[b1].4s, %[a0a].s[3]\n"
-                "fmla    v20.4s, %[b1].4s, %[a1a].s[0]\n"
-                "fmla    v21.4s, %[b1].4s, %[a1a].s[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]")
-                "fmla    v22.4s, %[b1].4s, %[a1a].s[2]\n"
-                "fmla    v23.4s, %[b1].4s, %[a1a].s[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]")
-                "fmla    v24.4s, %[b2].4s, %[a0a].s[0]\n"
-                "fmla    v25.4s, %[b2].4s, %[a0a].s[1]\n"
-                "fmla    v26.4s, %[b2].4s, %[a0a].s[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #640]")
-                "fmla    v27.4s, %[b2].4s, %[a0a].s[3]\n"
-                "fmla    v28.4s, %[b2].4s, %[a1a].s[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
-                "fmla    v29.4s, %[b2].4s, %[a1a].s[1]\n"
-                "add    %[a_ptr], %[a_ptr], #64\n"
-                "fmla    v30.4s, %[b2].4s, %[a1a].s[2]\n"
-                "add    %[b_ptr], %[b_ptr], #96\n"
-                "fmla    v31.4s, %[b2].4s, %[a1a].s[3]\n"
-                "b    3f\n"
+                "fmla	v11.4s, %[b0].4s, %[a0a].s[3]\n"
+                ASM_PREFETCHW("[%[c_ptr], #256]")
+                "fmla	v12.4s, %[b0].4s, %[a1a].s[0]\n"
+                "fmla	v13.4s, %[b0].4s, %[a1a].s[1]\n"
+                "fmla	v14.4s, %[b0].4s, %[a1a].s[2]\n"
+                ASM_PREFETCHW("[%[c_ptr], #320]")
+                "fmla	v15.4s, %[b0].4s, %[a1a].s[3]\n"
+                "fmla	v16.4s, %[b1].4s, %[a0a].s[0]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #384]")
+                "fmla	v17.4s, %[b1].4s, %[a0a].s[1]\n"
+                "fmla	v18.4s, %[b1].4s, %[a0a].s[2]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #448]")
+                "fmla	v19.4s, %[b1].4s, %[a0a].s[3]\n"
+                "fmla	v20.4s, %[b1].4s, %[a1a].s[0]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1a].s[1]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #512]")
+                "fmla	v22.4s, %[b1].4s, %[a1a].s[2]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1a].s[3]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #576]")
+                "fmla	v24.4s, %[b2].4s, %[a0a].s[0]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0a].s[1]\n"
+                "fmla	v26.4s, %[b2].4s, %[a0a].s[2]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #640]")
+                "fmla	v27.4s, %[b2].4s, %[a0a].s[3]\n"
+                "fmla	v28.4s, %[b2].4s, %[a1a].s[0]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #704]")
+                "fmla	v29.4s, %[b2].4s, %[a1a].s[1]\n"
+                "add	%[a_ptr], %[a_ptr], #64\n"
+                "fmla	v30.4s, %[b2].4s, %[a1a].s[2]\n"
+                "add	%[b_ptr], %[b_ptr], #96\n"
+                "fmla	v31.4s, %[b2].4s, %[a1a].s[3]\n"
+                "b	3f\n"
 
                 // Odd K continuation
                 "2:\n"
-                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n" ASM_PREFETCHW("[%[c_ptr]]")
-                "fmla    v12.4s, %[b0].4s, %[a1].s[0]\n"
-                "ins    %[b2].d[1], x20\n"
-                "fmla    v13.4s, %[b0].4s, %[a1].s[1]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
-                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
-                "add    %[a_ptr], %[a_ptr], #32\n"
-                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
-                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
-                "add    %[b_ptr], %[b_ptr], #48\n"
-                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
-                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
-                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
-                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
-                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
-                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
-                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
-                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
-                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
-                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
-                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]") "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]") "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n"
-                ASM_PREFETCHWL2("[%[c_ptr], #640]") "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
-                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+                "fmla	v11.4s, %[b0].4s, %[a0].s[3]\n"
+                ASM_PREFETCHW("[%[c_ptr]]")
+                "fmla	v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "ins	%[b2].d[1], x20\n"
+                "fmla	v13.4s, %[b0].4s, %[a1].s[1]\n"
+                ASM_PREFETCHW("[%[c_ptr], #64]")
+                "fmla	v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "add	%[a_ptr], %[a_ptr], #32\n"
+                "fmla	v15.4s, %[b0].4s, %[a1].s[3]\n"
+                ASM_PREFETCHW("[%[c_ptr], #128]")
+                "fmla	v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "add	%[b_ptr], %[b_ptr], #48\n"
+                "fmla	v17.4s, %[b1].4s, %[a0].s[1]\n"
+                ASM_PREFETCHW("[%[c_ptr], #192]")
+                "fmla	v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0].s[3]\n"
+                ASM_PREFETCHW("[%[c_ptr], #256]")
+                "fmla	v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1].s[1]\n"
+                ASM_PREFETCHW("[%[c_ptr], #320]")
+                "fmla	v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1].s[3]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #384]")
+                "fmla	v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0].s[1]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #448]")
+                "fmla	v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0].s[3]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #512]")
+                "fmla	v28.4s, %[b2].4s, %[a1].s[0]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #576]")
+                "fmla	v29.4s, %[b2].4s, %[a1].s[1]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #640]")
+                "fmla	v30.4s, %[b2].4s, %[a1].s[2]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #704]")
+                "fmla	v31.4s, %[b2].4s, %[a1].s[3]\n"
 
                 // Common tail
                 "3:\n"
-                "str    q8,   [%[c_ptr]]\n"
-                "str    q16,  [%[c_ptr], #16]\n"
-                "str    q24,  [%[c_ptr], #32]\n"
-                "str    q9,   [%[c_ptr], #48]\n"
-                "str    q17,  [%[c_ptr], #64]\n"
-                "str    q25,  [%[c_ptr], #80]\n"
-                "str    q10,  [%[c_ptr], #96]\n"
-                "str    q18,  [%[c_ptr], #112]\n"
-                "str    q26,  [%[c_ptr], #128]\n"
-                "str    q11,  [%[c_ptr], #144]\n"
-                "str    q19,  [%[c_ptr], #160]\n"
-                "str    q27,  [%[c_ptr], #176]\n"
-                "str    q12,  [%[c_ptr], #192]\n"
-                "str    q20,  [%[c_ptr], #208]\n"
-                "str    q28,  [%[c_ptr], #224]\n"
-                "str    q13,  [%[c_ptr], #240]\n"
-                "str    q21,  [%[c_ptr], #256]\n"
-                "str    q29,  [%[c_ptr], #272]\n"
-                "str    q14,  [%[c_ptr], #288]\n"
-                "str    q22,  [%[c_ptr], #304]\n"
-                "str    q30,  [%[c_ptr], #320]\n"
-                "str    q15,  [%[c_ptr], #336]\n"
-                "str    q23,  [%[c_ptr], #352]\n"
-                "str    q31,  [%[c_ptr], #368]\n"
-                "add    %[c_ptr], %[c_ptr], #384\n"
-                :
-                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
-                [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
-                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
-                : [oddk] "r"(oddk)
-                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
-                "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+                "str	q8,   [%[c_ptr]]\n"
+                "str	q16,  [%[c_ptr], #16]\n"
+                "str	q24,  [%[c_ptr], #32]\n"
+                "str	q9,   [%[c_ptr], #48]\n"
+                "str	q17,  [%[c_ptr], #64]\n"
+                "str	q25,  [%[c_ptr], #80]\n"
+                "str	q10,  [%[c_ptr], #96]\n"
+                "str	q18,  [%[c_ptr], #112]\n"
+                "str	q26,  [%[c_ptr], #128]\n"
+                "str	q11,  [%[c_ptr], #144]\n"
+                "str	q19,  [%[c_ptr], #160]\n"
+                "str	q27,  [%[c_ptr], #176]\n"
+                "str	q12,  [%[c_ptr], #192]\n"
+                "str	q20,  [%[c_ptr], #208]\n"
+                "str	q28,  [%[c_ptr], #224]\n"
+                "str	q13,  [%[c_ptr], #240]\n"
+                "str	q21,  [%[c_ptr], #256]\n"
+                "str	q29,  [%[c_ptr], #272]\n"
+                "str	q14,  [%[c_ptr], #288]\n"
+                "str	q22,  [%[c_ptr], #304]\n"
+                "str	q30,  [%[c_ptr], #320]\n"
+                "str	q15,  [%[c_ptr], #336]\n"
+                "str	q23,  [%[c_ptr], #352]\n"
+                "str	q31,  [%[c_ptr], #368]\n"
+                "add	%[c_ptr], %[c_ptr], #384\n"
+            :
+              [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+              [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
+              [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
+            : [oddk] "r" (oddk)
+            : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+              "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
+            );
         }
     }
 }
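Both kernels above compute the same 8x12 block per inner block iteration, and the pointer arithmetic fixes the packed layout: the A panel advances by 32 bytes (8 floats) and the B panel by 48 bytes (12 floats) per K step, while the common tail stores 24 q-registers, 384 bytes of C, before advancing c_ptr. As a readability aid only, the following is a plain-C++ reference for what one such block computes under that assumed panel layout; it is a sketch for checking results, not the production path, and the name sgemm_12x8_block_reference is invented here.

// Plain-C++ reference for one 8x12 block of the sgemm_12x8 kernels above,
// assuming the packed-panel layout implied by the pointer arithmetic:
// 8 floats of A and 12 floats of B per K step, and an 8x12 row-major block
// of C (96 floats, matching "add %[c_ptr], %[c_ptr], #384").
#include <cstddef>

static void sgemm_12x8_block_reference(const float *a_panel,  // 8 * K floats
                                        const float *b_panel, // 12 * K floats
                                        float *c_block,       // 96 floats (8x12), overwritten
                                        int K) {
    for (int i = 0; i < 8 * 12; i++) {
        c_block[i] = 0.0f;                      // accumulators start at zero (movi v8..v31)
    }
    for (int k = 0; k < K; k++) {
        const float *a = a_panel + (size_t)k * 8;   // one column of 8 A values
        const float *b = b_panel + (size_t)k * 12;  // one row of 12 B values
        for (int row = 0; row < 8; row++) {
            for (int col = 0; col < 12; col++) {
                // Each fmla in the kernel does four of these at once:
                // one broadcast A scalar against a 4-wide slice of B.
                c_block[row * 12 + col] += a[row] * b[col];
            }
        }
    }
}

Splitting each C row across three 4-wide accumulators (v8/v16/v24 for row 0, v9/v17/v25 for row 1, and so on) is what lets all 96 results stay resident in v8-v31 for the whole K loop, and it is why the common tail can simply store the registers in that interleaved q8, q16, q24, q9, ... order.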
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp
index 42e870e..7169c8b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp
@@ -37,311 +37,327 @@
 // Note that the intent of this is that either ablocks or bblocks will be 1
 // - this construction allows the output loop to proceed in either order.
 
-namespace arm_gemm
-{
-void a64_sgemm_asimd_12x8_jumps(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K, long int row_jump = 0, long int block_jump = 0)
-{
+namespace arm_gemm {
+
+void a64_sgemm_asimd_12x8_jumps(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K, long int row_jump=0, long int block_jump=0) {
     const float *a_ptr = Apanel;
-    float       *c_ptr = Cpanel;
+    float *c_ptr = Cpanel;
 
-    for(int yb = 0; yb < ablocks; yb++)
-    {
+    for (int yb=0; yb<ablocks; yb++) {
         const float *a_ptr0 = a_ptr;
-        const float *b_ptr  = Bpanel;
+        const float *b_ptr = Bpanel;
 
-        for(int xb = 0; xb < bblocks; xb++)
-        {
+        for (int xb=0; xb<bblocks; xb++) {
             a_ptr = a_ptr0;
             // Fix up for odd lengths - set a flag if K is odd, but make
             // sure we round up the iteration count.
             int oddk = (K & 1);
-            int k    = ((K + 1) / 2) - 1;
+            int k = ((K+1)/2) - 1;
 
-            register float32x4_t a0 asm("v0");
-            register float32x4_t a1 asm("v1");
-            register float32x4_t b0 asm("v2");
-            register float32x4_t b1 asm("v3");
-            register float32x4_t b2 asm("v4");
+            register float32x4_t a0  asm("v0");
+            register float32x4_t a1  asm("v1");
+            register float32x4_t b0  asm("v2");
+            register float32x4_t b1  asm("v3");
+            register float32x4_t b2  asm("v4");
             register float32x4_t a0a asm("v5");
             register float32x4_t a1a asm("v6");
 
-            __asm __volatile(
+            __asm __volatile (
                 // Initialize result registers, load initial operands, prime prefetches.
-                "movi    v8.4s, #0x0\n"
-                "ldr    %q[a0], [%[a_ptr]]\n"
-                "movi    v9.4s, #0x0\n"
-                "ldr    %q[b0], [%[b_ptr]]\n"
-                "movi    v10.4s, #0x0\n"
-                "ldr    %q[a1], [%[a_ptr], #16]\n"
-                "movi    v11.4s, #0x0\n"
-                "ldr    %q[b1], [%[b_ptr], #16]\n"
-                "movi    v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi    v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi    v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi    v15.4s, #0x0\n"
-                ASM_PREFETCH("[%[a_ptr], #128]") "movi    v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi    v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") "movi    v18.4s, #0x0\n"
-                ASM_PREFETCH("[%[a_ptr], #192]") "movi    v19.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]") "movi    v20.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]") "movi    v21.4s, #0x0\n"
+                "movi	v8.4s, #0x0\n"
+                "ldr	%q[a0], [%[a_ptr]]\n"
+                "movi	v9.4s, #0x0\n"
+                "ldr	%q[b0], [%[b_ptr]]\n"
+                "movi	v10.4s, #0x0\n"
+                "ldr	%q[a1], [%[a_ptr], #16]\n"
+                "movi	v11.4s, #0x0\n"
+                "ldr	%q[b1], [%[b_ptr], #16]\n"
+                "movi	v12.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi	v13.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi	v14.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi	v15.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #128]")
+                "movi	v16.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi	v17.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi	v18.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi	v19.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi	v20.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #256]")
+                "movi	v21.4s, #0x0\n"
                 ASM_PREFETCH("[%[b_ptr], #384]")
-                "movi    v22.4s, #0x0\n"
-                "movi    v23.4s, #0x0\n"
-                "movi    v24.4s, #0x0\n"
-                "movi    v25.4s, #0x0\n"
-                "movi    v26.4s, #0x0\n"
-                "movi    v27.4s, #0x0\n"
-                "movi    v28.4s, #0x0\n"
-                "movi    v29.4s, #0x0\n"
-                "movi    v30.4s, #0x0\n"
-                "movi    v31.4s, #0x0\n"
+                "movi	v22.4s, #0x0\n"
+                "movi	v23.4s, #0x0\n"
+                "movi	v24.4s, #0x0\n"
+                "movi	v25.4s, #0x0\n"
+                "movi	v26.4s, #0x0\n"
+                "movi	v27.4s, #0x0\n"
+                "movi	v28.4s, #0x0\n"
+                "movi	v29.4s, #0x0\n"
+                "movi	v30.4s, #0x0\n"
+                "movi	v31.4s, #0x0\n"
 
                 // Skip loop if we are doing zero iterations of it.
-                "cbz    %w[k], 4f\n"
+                "cbz	%w[k], 4f\n"
 
                 // Loop proper
                 "1:\n"
-                "fmla     v8.4s , %[b0].4s, %[a0].s[0]\n"
-                "fmla      v9.4s , %[b0].4s, %[a0].s[1]\n"
-                "ldr    %q[b2], [%[b_ptr], #32]\n"
-                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
-                "add    %[b_ptr], %[b_ptr], %[row_jump]\n"
-                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
-                "ldr    %q[a0a], [%[a_ptr], #32]\n"
-                "fmla     v12.4s, %[b0].4s, %[a1].s[0]\n"
-                "fmla    v13.4s, %[b0].4s, %[a1].s[1]\n"
-                "ldr    %q[a1a], [%[a_ptr], #48]\n"
-                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
-                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
-                "ldr    %q[b0], [%[b_ptr], #48]\n"
+                "fmla 	v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "fmla  	v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "ldr	%q[b2], [%[b_ptr], #32]\n"
+                "fmla	v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "add	%[b_ptr], %[b_ptr], %[row_jump]\n"
+                "fmla	v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "ldr	%q[a0a], [%[a_ptr], #32]\n"
+                "fmla 	v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla	v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "ldr	%q[a1a], [%[a_ptr], #48]\n"
+                "fmla	v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "fmla	v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "ldr	%q[b0], [%[b_ptr], #48]\n"
 
-                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
-                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n" ASM_PREFETCH("[%[a_ptr], #320]")
-                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
-                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
-                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
-                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
-                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
-                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
-                "ldr    %q[b1], [%[b_ptr], #64]\n"
+                "fmla	v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "fmla	v17.4s, %[b1].4s, %[a0].s[1]\n"
+                ASM_PREFETCH("[%[a_ptr], #320]")
+                "fmla	v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "fmla	v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "ldr	%q[b1], [%[b_ptr], #64]\n"
 
-                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
-                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n" ASM_PREFETCH("[%[b_ptr], #448]")
-                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
-                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n"
-                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
-                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n"
-                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n"
-                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
-                "ldr    %q[b2], [%[b_ptr], #80]\n"
+                "fmla	v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0].s[1]\n"
+                ASM_PREFETCH("[%[b_ptr], #448]")
+                "fmla	v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "fmla	v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "fmla	v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "fmla	v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "fmla	v31.4s, %[b2].4s, %[a1].s[3]\n"
+                "ldr	%q[b2], [%[b_ptr], #80]\n"
 
-                "fmla     v8.4s , %[b0].4s, %[a0a].s[0]\n"
-                "fmla    v9.4s , %[b0].4s, %[a0a].s[1]\n"
-                "ldr    %q[a0], [%[a_ptr], #64]\n"
-                "fmla    v10.4s, %[b0].4s, %[a0a].s[2]\n"
-                "add    %[b_ptr], %[b_ptr], %[row_jump]\n"
-                "fmla    v11.4s, %[b0].4s, %[a0a].s[3]\n"
-                "fmla     v12.4s, %[b0].4s, %[a1a].s[0]\n"
-                "ldr    %q[a1], [%[a_ptr], #80]\n"
+                "fmla 	v8.4s , %[b0].4s, %[a0a].s[0]\n"
+                "fmla	v9.4s , %[b0].4s, %[a0a].s[1]\n"
+                "ldr	%q[a0], [%[a_ptr], #64]\n"
+                "fmla	v10.4s, %[b0].4s, %[a0a].s[2]\n"
+                "add	%[b_ptr], %[b_ptr], %[row_jump]\n"
+                "fmla	v11.4s, %[b0].4s, %[a0a].s[3]\n"
+                "fmla 	v12.4s, %[b0].4s, %[a1a].s[0]\n"
+                "ldr	%q[a1], [%[a_ptr], #80]\n"
                 "fmla   v13.4s, %[b0].4s, %[a1a].s[1]\n"
-                "fmla    v14.4s, %[b0].4s, %[a1a].s[2]\n"
-                "fmla    v15.4s, %[b0].4s, %[a1a].s[3]\n"
-                "ldr    %q[b0], [%[b_ptr], #96]\n"
+                "fmla	v14.4s, %[b0].4s, %[a1a].s[2]\n"
+                "fmla	v15.4s, %[b0].4s, %[a1a].s[3]\n"
+                "ldr	%q[b0], [%[b_ptr], #96]\n"
 
-                "fmla    v16.4s, %[b1].4s, %[a0a].s[0]\n"
-                "fmla    v17.4s, %[b1].4s, %[a0a].s[1]\n" ASM_PREFETCH("[%[b_ptr], #512]")
-                "fmla    v18.4s, %[b1].4s, %[a0a].s[2]\n"
-                "fmla    v19.4s, %[b1].4s, %[a0a].s[3]\n"
-                "fmla    v20.4s, %[b1].4s, %[a1a].s[0]\n"
-                "fmla    v21.4s, %[b1].4s, %[a1a].s[1]\n"
-                "fmla    v22.4s, %[b1].4s, %[a1a].s[2]\n"
-                "fmla    v23.4s, %[b1].4s, %[a1a].s[3]\n"
-                "ldr    %q[b1], [%[b_ptr], #112]\n"
+                "fmla	v16.4s, %[b1].4s, %[a0a].s[0]\n"
+                "fmla	v17.4s, %[b1].4s, %[a0a].s[1]\n"
+                ASM_PREFETCH("[%[b_ptr], #512]")
+                "fmla	v18.4s, %[b1].4s, %[a0a].s[2]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0a].s[3]\n"
+                "fmla	v20.4s, %[b1].4s, %[a1a].s[0]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1a].s[1]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1a].s[2]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1a].s[3]\n"
+                "ldr	%q[b1], [%[b_ptr], #112]\n"
 
-                "fmla    v24.4s, %[b2].4s, %[a0a].s[0]\n"
-                "fmla    v25.4s, %[b2].4s, %[a0a].s[1]\n"
-                "add    %[a_ptr], %[a_ptr], #64\n"
-                "fmla    v26.4s, %[b2].4s, %[a0a].s[2]\n"
-                "fmla    v27.4s, %[b2].4s, %[a0a].s[3]\n"
-                "add    %[b_ptr], %[b_ptr], #96\n"
-                "fmla    v28.4s, %[b2].4s, %[a1a].s[0]\n"
-                "fmla    v29.4s, %[b2].4s, %[a1a].s[1]\n"
-                "subs    %w[k], %w[k], #1\n"
-                "fmla    v30.4s, %[b2].4s, %[a1a].s[2]\n"
-                "fmla    v31.4s, %[b2].4s, %[a1a].s[3]\n"
-                "bne    1b\n"
+                "fmla	v24.4s, %[b2].4s, %[a0a].s[0]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0a].s[1]\n"
+                "add	%[a_ptr], %[a_ptr], #64\n"
+                "fmla	v26.4s, %[b2].4s, %[a0a].s[2]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0a].s[3]\n"
+                "add	%[b_ptr], %[b_ptr], #96\n"
+                "fmla	v28.4s, %[b2].4s, %[a1a].s[0]\n"
+                "fmla	v29.4s, %[b2].4s, %[a1a].s[1]\n"
+                "subs	%w[k], %w[k], #1\n"
+                "fmla	v30.4s, %[b2].4s, %[a1a].s[2]\n"
+                "fmla	v31.4s, %[b2].4s, %[a1a].s[3]\n"
+                "bne	1b\n"
 
                 // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
                 "4:\n"
 
                 // Branch to alternative tail for odd K
-                "cbnz    %w[oddk], 2f\n"
+                "cbnz	%w[oddk], 2f\n"
 
                 // Detached final iteration (even K)
-                "fmla     v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "fmla 	v8.4s , %[b0].4s, %[a0].s[0]\n"
                 "fmla   v9.4s , %[b0].4s, %[a0].s[1]\n"
-                "ldr    %q[b2], [%[b_ptr], #32]\n"
-                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
-                "add    %[b_ptr], %[b_ptr], %[row_jump]\n"
-                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
-                "ldr    %q[a0a], [%[a_ptr], #32]\n"
-                "fmla     v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "ldr	%q[b2], [%[b_ptr], #32]\n"
+                "fmla	v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "add	%[b_ptr], %[b_ptr], %[row_jump]\n"
+                "fmla	v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "ldr	%q[a0a], [%[a_ptr], #32]\n"
+                "fmla 	v12.4s, %[b0].4s, %[a1].s[0]\n"
                 "fmla   v13.4s, %[b0].4s, %[a1].s[1]\n"
-                "ldr    %q[a1a], [%[a_ptr], #48]\n"
-                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
-                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
-                "ldr    %q[b0], [%[b_ptr], #48]\n"
+                "ldr	%q[a1a], [%[a_ptr], #48]\n"
+                "fmla	v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "fmla	v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "ldr	%q[b0], [%[b_ptr], #48]\n"
 
-                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
-                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n"
-                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
-                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
-                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
-                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
-                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
-                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
-                "ldr    %q[b1], [%[b_ptr], #64]\n"
+                "fmla	v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "fmla	v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "fmla	v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "fmla	v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "ldr	%q[b1], [%[b_ptr], #64]\n"
 
-                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
-                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n"
-                "add    %[a_ptr], %[a_ptr], #64\n"
-                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
-                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n"
-                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
-                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n"
-                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n"
-                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
-                "ldr    %q[b2], [%[b_ptr], #80]\n"
+                "fmla	v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0].s[1]\n"
+                "add	%[a_ptr], %[a_ptr], #64\n"
+                "fmla	v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "fmla	v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "fmla	v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "fmla	v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "fmla	v31.4s, %[b2].4s, %[a1].s[3]\n"
+                "ldr	%q[b2], [%[b_ptr], #80]\n"
 
-                "fmla     v8.4s , %[b0].4s, %[a0a].s[0]\n"
-                "add    %[b_ptr], %[b_ptr], %[block_jump]\n"
-                "fmla    v16.4s, %[b1].4s, %[a0a].s[0]\n"
-                "add    %[b_ptr], %[b_ptr], #96\n"
+                "fmla 	v8.4s , %[b0].4s, %[a0a].s[0]\n"
+                "add	%[b_ptr], %[b_ptr], %[block_jump]\n"
+                "fmla	v16.4s, %[b1].4s, %[a0a].s[0]\n"
+                "add	%[b_ptr], %[b_ptr], #96\n"
                 "fmla   v9.4s , %[b0].4s, %[a0a].s[1]\n"
-                "add    %[b_ptr], %[b_ptr], %[row_jump]\n"
-                "str    q8, [%[c_ptr], #0]\n"
-                "fmla    v17.4s, %[b1].4s, %[a0a].s[1]\n"
-                "str    q16, [%[c_ptr], #16]\n"
-                "fmla    v24.4s, %[b2].4s, %[a0a].s[0]\n"
-                "str    q24, [%[c_ptr], #32]\n"
+                "add	%[b_ptr], %[b_ptr], %[row_jump]\n"
+                "str	q8, [%[c_ptr], #0]\n"
+                "fmla	v17.4s, %[b1].4s, %[a0a].s[1]\n"
+                "str	q16, [%[c_ptr], #16]\n"
+                "fmla	v24.4s, %[b2].4s, %[a0a].s[0]\n"
+                "str	q24, [%[c_ptr], #32]\n"
 
-                "fmla    v25.4s, %[b2].4s, %[a0a].s[1]\n"
-                "str    q9, [%[c_ptr], #48]\n"
-                "fmla    v10.4s, %[b0].4s, %[a0a].s[2]\n"
-                "str    q17, [%[c_ptr], #64]\n"
-                "fmla    v18.4s, %[b1].4s, %[a0a].s[2]\n"
-                "str    q25, [%[c_ptr], #80]\n"
-                "fmla    v26.4s, %[b2].4s, %[a0a].s[2]\n"
-                "str    q10, [%[c_ptr], #96]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0a].s[1]\n"
+                "str	q9, [%[c_ptr], #48]\n"
+                "fmla	v10.4s, %[b0].4s, %[a0a].s[2]\n"
+                "str	q17, [%[c_ptr], #64]\n"
+                "fmla	v18.4s, %[b1].4s, %[a0a].s[2]\n"
+                "str	q25, [%[c_ptr], #80]\n"
+                "fmla	v26.4s, %[b2].4s, %[a0a].s[2]\n"
+                "str	q10, [%[c_ptr], #96]\n"
 
-                "fmla    v11.4s, %[b0].4s, %[a0a].s[3]\n"
-                "str    q18, [%[c_ptr], #112]\n"
-                "fmla    v19.4s, %[b1].4s, %[a0a].s[3]\n"
-                "str    q26, [%[c_ptr], #128]\n"
-                "fmla    v27.4s, %[b2].4s, %[a0a].s[3]\n"
-                "str    q11, [%[c_ptr], #144]\n"
+                "fmla	v11.4s, %[b0].4s, %[a0a].s[3]\n"
+                "str	q18, [%[c_ptr], #112]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0a].s[3]\n"
+                "str	q26, [%[c_ptr], #128]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0a].s[3]\n"
+                "str	q11, [%[c_ptr], #144]\n"
 
-                "fmla     v12.4s, %[b0].4s, %[a1a].s[0]\n"
-                "str    q19, [%[c_ptr], #160]\n"
-                "fmla    v20.4s, %[b1].4s, %[a1a].s[0]\n"
-                "str    q27, [%[c_ptr], #176]\n"
-                "fmla    v28.4s, %[b2].4s, %[a1a].s[0]\n"
-                "str    q12, [%[c_ptr], #192]\n"
+                "fmla 	v12.4s, %[b0].4s, %[a1a].s[0]\n"
+                "str	q19, [%[c_ptr], #160]\n"
+                "fmla	v20.4s, %[b1].4s, %[a1a].s[0]\n"
+                "str	q27, [%[c_ptr], #176]\n"
+                "fmla	v28.4s, %[b2].4s, %[a1a].s[0]\n"
+                "str	q12, [%[c_ptr], #192]\n"
 
                 "fmla   v13.4s, %[b0].4s, %[a1a].s[1]\n"
-                "str    q20, [%[c_ptr], #208]\n"
-                "fmla    v21.4s, %[b1].4s, %[a1a].s[1]\n"
-                "str    q28, [%[c_ptr], #224]\n"
-                "fmla    v29.4s, %[b2].4s, %[a1a].s[1]\n"
-                "str    q13, [%[c_ptr], #240]\n"
+                "str	q20, [%[c_ptr], #208]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1a].s[1]\n"
+                "str	q28, [%[c_ptr], #224]\n"
+                "fmla	v29.4s, %[b2].4s, %[a1a].s[1]\n"
+                "str	q13, [%[c_ptr], #240]\n"
 
-                "fmla    v14.4s, %[b0].4s, %[a1a].s[2]\n"
-                "str    q21, [%[c_ptr], #256]\n"
-                "fmla    v22.4s, %[b1].4s, %[a1a].s[2]\n"
-                "str    q29, [%[c_ptr], #272]\n"
-                "fmla    v30.4s, %[b2].4s, %[a1a].s[2]\n"
-                "str    q14, [%[c_ptr], #288]\n"
+                "fmla	v14.4s, %[b0].4s, %[a1a].s[2]\n"
+                "str	q21, [%[c_ptr], #256]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1a].s[2]\n"
+                "str	q29, [%[c_ptr], #272]\n"
+                "fmla	v30.4s, %[b2].4s, %[a1a].s[2]\n"
+                "str	q14, [%[c_ptr], #288]\n"
 
-                "fmla    v15.4s, %[b0].4s, %[a1a].s[3]\n"
-                "str    q22, [%[c_ptr], #304]\n"
-                "fmla    v23.4s, %[b1].4s, %[a1a].s[3]\n"
-                "str    q30, [%[c_ptr], #320]\n"
-                "fmla    v31.4s, %[b2].4s, %[a1a].s[3]\n"
-                "str    q15, [%[c_ptr], #336]\n"
+                "fmla	v15.4s, %[b0].4s, %[a1a].s[3]\n"
+                "str	q22, [%[c_ptr], #304]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1a].s[3]\n"
+                "str	q30, [%[c_ptr], #320]\n"
+                "fmla	v31.4s, %[b2].4s, %[a1a].s[3]\n"
+                "str	q15, [%[c_ptr], #336]\n"
 
-                "b    3f\n"
+                "b	3f\n"
 
                 // Detached final iteration (odd K)
                 "2:\n"
-                "fmla     v8.4s , %[b0].4s, %[a0].s[0]\n"
-                "ldr    %q[b2], [%[b_ptr], #32]\n"
-                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
-                "add    %[b_ptr], %[b_ptr], %[row_jump]\n"
+                "fmla 	v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "ldr	%q[b2], [%[b_ptr], #32]\n"
+                "fmla	v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "add	%[b_ptr], %[b_ptr], %[row_jump]\n"
                 "fmla   v9.4s , %[b0].4s, %[a0].s[1]\n"
-                "str    q8, [%[c_ptr], #0]\n"
-                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n"
-                "str    q16, [%[c_ptr], #16]\n"
-                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
-                "add    %[b_ptr], %[b_ptr], #48\n"
-                "add    %[a_ptr], %[a_ptr], #32\n"
-                "str    q24, [%[c_ptr], #32]\n"
-                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n"
-                "str    q9, [%[c_ptr], #48]\n"
+                "str	q8, [%[c_ptr], #0]\n"
+                "fmla	v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "str	q16, [%[c_ptr], #16]\n"
+                "fmla	v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "add	%[b_ptr], %[b_ptr], #48\n"
+                "add	%[a_ptr], %[a_ptr], #32\n"
+                "str	q24, [%[c_ptr], #32]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0].s[1]\n"
+                "str	q9, [%[c_ptr], #48]\n"
 
-                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
-                "str    q17, [%[c_ptr], #64]\n"
-                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
-                "str    q25, [%[c_ptr], #80]\n"
-                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
-                "str    q10, [%[c_ptr], #96]\n"
+                "fmla	v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "str	q17, [%[c_ptr], #64]\n"
+                "fmla	v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "str	q25, [%[c_ptr], #80]\n"
+                "fmla	v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "str	q10, [%[c_ptr], #96]\n"
 
-                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
-                "str    q18, [%[c_ptr], #112]\n"
-                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
-                "str    q26, [%[c_ptr], #128]\n"
-                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n"
-                "str    q11, [%[c_ptr], #144]\n"
+                "fmla	v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "str	q18, [%[c_ptr], #112]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "str	q26, [%[c_ptr], #128]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "str	q11, [%[c_ptr], #144]\n"
 
-                "fmla     v12.4s, %[b0].4s, %[a1].s[0]\n"
-                "str    q19, [%[c_ptr], #160]\n"
-                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
-                "str    q27, [%[c_ptr], #176]\n"
-                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
-                "str    q12, [%[c_ptr], #192]\n"
+                "fmla 	v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "str	q19, [%[c_ptr], #160]\n"
+                "fmla	v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "str	q27, [%[c_ptr], #176]\n"
+                "fmla	v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "str	q12, [%[c_ptr], #192]\n"
 
                 "fmla   v13.4s, %[b0].4s, %[a1].s[1]\n"
-                "str    q20, [%[c_ptr], #208]\n"
-                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
-                "str    q28, [%[c_ptr], #224]\n"
-                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n"
-                "str    q13, [%[c_ptr], #240]\n"
+                "str	q20, [%[c_ptr], #208]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "str	q28, [%[c_ptr], #224]\n"
+                "fmla	v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "str	q13, [%[c_ptr], #240]\n"
 
-                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
-                "str    q21, [%[c_ptr], #256]\n"
-                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
-                "str    q29, [%[c_ptr], #272]\n"
-                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n"
-                "str    q14, [%[c_ptr], #288]\n"
+                "fmla	v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "str	q21, [%[c_ptr], #256]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "str	q29, [%[c_ptr], #272]\n"
+                "fmla	v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "str	q14, [%[c_ptr], #288]\n"
 
-                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
-                "str    q22, [%[c_ptr], #304]\n"
-                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
-                "str    q30, [%[c_ptr], #320]\n"
-                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
-                "str    q15, [%[c_ptr], #336]\n"
+                "fmla	v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "str	q22, [%[c_ptr], #304]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "str	q30, [%[c_ptr], #320]\n"
+                "fmla	v31.4s, %[b2].4s, %[a1].s[3]\n"
+                "str	q15, [%[c_ptr], #336]\n"
 
                 // Common tail
                 "3:\n"
-                "str    q23, [%[c_ptr], #352]\n"
-                "str    q31, [%[c_ptr], #368]\n"
-                "add    %[c_ptr], %[c_ptr], #384\n"
-                :
-                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
-                [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
-                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
-                : [oddk] "r"(oddk), [row_jump] "r"(row_jump), [block_jump] "r"(block_jump)
-                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
-                "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+                "str	q23, [%[c_ptr], #352]\n"
+                "str	q31, [%[c_ptr], #368]\n"
+                "add	%[c_ptr], %[c_ptr], #384\n"
+            :
+              [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+              [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
+              [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
+            : [oddk] "r" (oddk), [row_jump] "r" (row_jump), [block_jump] "r" (block_jump)
+            : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+              "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
+            );
         }
     }
 }
 
-void a64_sgemm_asimd_12x8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K)
-{
+void a64_sgemm_asimd_12x8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
     a64_sgemm_asimd_12x8_jumps(Apanel, Bpanel, Cpanel, ablocks, bblocks, K, 0, 0);
 }
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp
index eceacc9..11a589d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp
@@ -25,8 +25,8 @@
 
 #ifdef __aarch64__
 
-namespace arm_gemm
-{
+namespace arm_gemm {
+
 // Actual kernel implementations
 void a64_sgemm_native_16x4(const float *, int, const float *, int, float *, int, float, int, int, int);
 
@@ -38,8 +38,7 @@
 // All kernels in the family must share these characteristics.  The actual
 // kernel to be used can be chosen at runtime, based on the CPU_type
 // structure.
-class sgemm_native_16x4
-{
+class sgemm_native_16x4 {
 public:
     typedef float operand_type;
     typedef float result_type;
@@ -47,15 +46,15 @@
     typedef void (*kern_type)(const float *, int, const float *, int, float *, int, float, int, int, int);
 
     /* Kernel blocking parameters */
-    static const int out_width  = 16;
+    static const int out_width = 16;
     static const int out_height = 4;
-    static const int k_unroll   = 1;
+    static const int k_unroll = 1;
 
     // Default to the generic kernel
-    kern_type kernel = a64_sgemm_native_16x4;
+    kern_type kernel=a64_sgemm_native_16x4;
 
-    sgemm_native_16x4(const CPUInfo *ci)
-    {
+    sgemm_native_16x4(const CPUInfo *ci) {
+
     }
 };
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4/generic.cpp
index 89a16f7..8325b3f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4/generic.cpp
@@ -29,12 +29,11 @@
 
 #include <arm_neon.h>
 
-namespace arm_gemm
-{
-void a64_sgemm_native_16x4(const float *A, int lda, const float *B, int ldb, float *C, int ldc, float beta, int M, int N, int K)
-{
-    const int oddk    = ((K % 8) >= 4) ? 1 : 0;
-    const int beta0   = (beta == 0.0f) ? 1 : 0;
+namespace arm_gemm {
+
+void a64_sgemm_native_16x4(const float *A, int lda, const float *B, int ldb, float *C, int ldc, float beta, int M, int N, int K) {
+    const int oddk = ((K % 8) >= 4) ? 1 : 0;
+    const int beta0 = (beta == 0.0f) ? 1 : 0;
     const int oddones = (K % 4);
 
     float dummy_buffer[16];
@@ -67,12 +66,12 @@
 
             const float *b_ptr = B + x0;
 
-            int loops = ((K + 4) / 8) - 1;
-            int odds  = oddones;
+            int loops = ((K+4)/8) - 1;
+            int odds = oddones;
 
             size_t ldbb = ldb * sizeof(float);
 
-            __asm __volatile(
+            __asm __volatile (
                 "a0   .req v0\n"
                 "a1   .req v1\n"
                 "a2   .req v2\n"
@@ -107,140 +106,140 @@
                 "b2aq .req q14\n"
                 "b3aq .req q15\n"
 
-                "movi    v16.4s, #0x0\n"
-                "ldr    a0q, [%[a_ptr0]]\n"
-                "movi    v17.4s, #0x0\n"
-                "ldr    b0q, [%[b_ptr]]\n"
-                "movi    v18.4s, #0x0\n"
-                "ldr    b1q, [%[b_ptr], #16]\n"
-                "movi    v19.4s, #0x0\n"
-                "ldr    b2q, [%[b_ptr], #32]\n"
-                "movi    v20.4s, #0x0\n"
-                "ldr    b3q, [%[b_ptr], #48]\n"
-                "movi    v21.4s, #0x0\n"
-                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
-                "ldr    a1q, [%[a_ptr1]]\n"
-                "movi    v22.4s, #0x0\n"
-                "ldr    a2q, [%[a_ptr2]]\n"
-                "movi    v23.4s, #0x0\n"
-                "ldr    a3q, [%[a_ptr3]]\n"
-                "movi    v24.4s, #0x0\n"
-                "ldr    b0aq, [%[b_ptr]]\n"
-                "movi    v25.4s, #0x0\n"
-                "ldr    b1aq, [%[b_ptr], #16]\n"
-                "movi    v26.4s, #0x0\n"
-                "ldr    b2aq, [%[b_ptr], #32]\n"
-                "cbz    %w[beta0], 5f\n"
-                "movi    v27.4s, #0x0\n"
-                "movi    v28.4s, #0x0\n"
-                "movi    v29.4s, #0x0\n"
-                "movi    v30.4s, #0x0\n"
-                "movi    v31.4s, #0x0\n"
+                "movi	v16.4s, #0x0\n"
+                "ldr	a0q, [%[a_ptr0]]\n"
+                "movi	v17.4s, #0x0\n"
+                "ldr	b0q, [%[b_ptr]]\n"
+                "movi	v18.4s, #0x0\n"
+                "ldr	b1q, [%[b_ptr], #16]\n"
+                "movi	v19.4s, #0x0\n"
+                "ldr	b2q, [%[b_ptr], #32]\n"
+                "movi	v20.4s, #0x0\n"
+                "ldr	b3q, [%[b_ptr], #48]\n"
+                "movi	v21.4s, #0x0\n"
+                "add	%[b_ptr], %[b_ptr], %[ldb]\n"
+                "ldr	a1q, [%[a_ptr1]]\n"
+                "movi	v22.4s, #0x0\n"
+                "ldr	a2q, [%[a_ptr2]]\n"
+                "movi	v23.4s, #0x0\n"
+                "ldr	a3q, [%[a_ptr3]]\n"
+                "movi	v24.4s, #0x0\n"
+                "ldr	b0aq, [%[b_ptr]]\n"
+                "movi	v25.4s, #0x0\n"
+                "ldr	b1aq, [%[b_ptr], #16]\n"
+                "movi	v26.4s, #0x0\n"
+                "ldr	b2aq, [%[b_ptr], #32]\n"
+                "cbz	%w[beta0], 5f\n"
+                "movi	v27.4s, #0x0\n"
+                "movi	v28.4s, #0x0\n"
+                "movi	v29.4s, #0x0\n"
+                "movi	v30.4s, #0x0\n"
+                "movi	v31.4s, #0x0\n"
 
                 // Skip if no complete loops.
-                "cbz    %w[loops], 4f\n"
-                "b    1f\n"
+                "cbz	%w[loops], 4f\n"
+                "b	1f\n"
 
                 // If beta is non-zero, need to load and multiply by beta
                 "5:\n"
-                "ld1r    {v4.4s}, [%[betaptr]]\n"
-                "ldr    q16, [%[c_ptr0]]\n"
-                "ldr    q17, [%[c_ptr0], #16]\n"
-                "ldr    q18, [%[c_ptr0], #32]\n"
-                "ldr    q19, [%[c_ptr0], #48]\n"
+                "ld1r	{v4.4s}, [%[betaptr]]\n"
+                "ldr	q16, [%[c_ptr0]]\n"
+                "ldr	q17, [%[c_ptr0], #16]\n"
+                "ldr	q18, [%[c_ptr0], #32]\n"
+                "ldr	q19, [%[c_ptr0], #48]\n"
 
-                "ldr    q20, [%[c_ptr1]]\n"
-                "fmul    v16.4s, v16.4s, v4.4s\n"
-                "ldr    q21, [%[c_ptr1], #16]\n"
-                "fmul    v17.4s, v17.4s, v4.4s\n"
-                "ldr    q22, [%[c_ptr1], #32]\n"
-                "fmul    v18.4s, v18.4s, v4.4s\n"
-                "ldr    q23, [%[c_ptr1], #48]\n"
-                "fmul    v19.4s, v19.4s, v4.4s\n"
+                "ldr	q20, [%[c_ptr1]]\n"
+                "fmul	v16.4s, v16.4s, v4.4s\n"
+                "ldr	q21, [%[c_ptr1], #16]\n"
+                "fmul	v17.4s, v17.4s, v4.4s\n"
+                "ldr	q22, [%[c_ptr1], #32]\n"
+                "fmul	v18.4s, v18.4s, v4.4s\n"
+                "ldr	q23, [%[c_ptr1], #48]\n"
+                "fmul	v19.4s, v19.4s, v4.4s\n"
 
-                "ldr    q24, [%[c_ptr2]]\n"
-                "fmul    v20.4s, v20.4s, v4.4s\n"
-                "ldr    q25, [%[c_ptr2], #16]\n"
-                "fmul    v21.4s, v21.4s, v4.4s\n"
-                "ldr    q26, [%[c_ptr2], #32]\n"
-                "fmul    v22.4s, v22.4s, v4.4s\n"
-                "ldr    q27, [%[c_ptr2], #48]\n"
-                "fmul    v23.4s, v23.4s, v4.4s\n"
+                "ldr	q24, [%[c_ptr2]]\n"
+                "fmul	v20.4s, v20.4s, v4.4s\n"
+                "ldr	q25, [%[c_ptr2], #16]\n"
+                "fmul	v21.4s, v21.4s, v4.4s\n"
+                "ldr	q26, [%[c_ptr2], #32]\n"
+                "fmul	v22.4s, v22.4s, v4.4s\n"
+                "ldr	q27, [%[c_ptr2], #48]\n"
+                "fmul	v23.4s, v23.4s, v4.4s\n"
 
-                "ldr    q28, [%[c_ptr3]]\n"
-                "fmul    v24.4s, v24.4s, v4.4s\n"
-                "ldr    q29, [%[c_ptr3], #16]\n"
-                "fmul    v25.4s, v25.4s, v4.4s\n"
-                "ldr    q30, [%[c_ptr3], #32]\n"
-                "fmul    v26.4s, v26.4s, v4.4s\n"
-                "ldr    q31, [%[c_ptr3], #48]\n"
-                "fmul    v27.4s, v27.4s, v4.4s\n"
+                "ldr	q28, [%[c_ptr3]]\n"
+                "fmul	v24.4s, v24.4s, v4.4s\n"
+                "ldr	q29, [%[c_ptr3], #16]\n"
+                "fmul	v25.4s, v25.4s, v4.4s\n"
+                "ldr	q30, [%[c_ptr3], #32]\n"
+                "fmul	v26.4s, v26.4s, v4.4s\n"
+                "ldr	q31, [%[c_ptr3], #48]\n"
+                "fmul	v27.4s, v27.4s, v4.4s\n"
 
-                "fmul    v28.4s, v28.4s, v4.4s\n"
-                "fmul    v29.4s, v29.4s, v4.4s\n"
-                "fmul    v30.4s, v30.4s, v4.4s\n"
-                "fmul    v31.4s, v31.4s, v4.4s\n"
+                "fmul	v28.4s, v28.4s, v4.4s\n"
+                "fmul	v29.4s, v29.4s, v4.4s\n"
+                "fmul	v30.4s, v30.4s, v4.4s\n"
+                "fmul	v31.4s, v31.4s, v4.4s\n"
 
-                "cbz    %w[loops], 4f\n"
+                "cbz	%w[loops], 4f\n"
 
                 "1:\n"
                 // Unroll 0
-                "fmla    v16.4s, bb0.4s, a0.s[0]\n"
-                "fmla    v20.4s, bb0.4s, a1.s[0]\n"
-                "ldr    b3aq, [%[b_ptr], #48]\n"
-                "fmla    v24.4s, bb0.4s, a2.s[0]\n"
-                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
-                "fmla    v28.4s, bb0.4s, a3.s[0]\n"
-                "ldr    b0q, [%[b_ptr]]\n"
+                "fmla	v16.4s, bb0.4s, a0.s[0]\n"
+                "fmla	v20.4s, bb0.4s, a1.s[0]\n"
+                "ldr	b3aq, [%[b_ptr], #48]\n"
+                "fmla	v24.4s, bb0.4s, a2.s[0]\n"
+                "add	%[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla	v28.4s, bb0.4s, a3.s[0]\n"
+                "ldr	b0q, [%[b_ptr]]\n"
 
-                "fmla    v17.4s, bb1.4s, a0.s[0]\n"
-                "fmla    v21.4s, bb1.4s, a1.s[0]\n"
-                "ldr    a0aq, [%[a_ptr0], #16]\n"
-                "fmla    v25.4s, bb1.4s, a2.s[0]\n"
-                "fmla    v29.4s, bb1.4s, a3.s[0]\n"
-                "ldr    b1q, [%[b_ptr], #16]\n"
+                "fmla	v17.4s, bb1.4s, a0.s[0]\n"
+                "fmla	v21.4s, bb1.4s, a1.s[0]\n"
+                "ldr	a0aq, [%[a_ptr0], #16]\n"
+                "fmla	v25.4s, bb1.4s, a2.s[0]\n"
+                "fmla	v29.4s, bb1.4s, a3.s[0]\n"
+                "ldr	b1q, [%[b_ptr], #16]\n"
 
-                "fmla    v18.4s, bb2.4s, a0.s[0]\n"
-                "fmla    v22.4s, bb2.4s, a1.s[0]\n"
-                "ldr    a1aq, [%[a_ptr1], #16]\n"
-                "fmla    v26.4s, bb2.4s, a2.s[0]\n"
-                "fmla    v30.4s, bb2.4s, a3.s[0]\n"
-                "ldr    b2q, [%[b_ptr], #32]\n"
+                "fmla	v18.4s, bb2.4s, a0.s[0]\n"
+                "fmla	v22.4s, bb2.4s, a1.s[0]\n"
+                "ldr	a1aq, [%[a_ptr1], #16]\n"
+                "fmla	v26.4s, bb2.4s, a2.s[0]\n"
+                "fmla	v30.4s, bb2.4s, a3.s[0]\n"
+                "ldr	b2q, [%[b_ptr], #32]\n"
 
-                "fmla    v19.4s, bb3.4s, a0.s[0]\n"
-                "fmla    v23.4s, bb3.4s, a1.s[0]\n"
-                "ldr    a2aq, [%[a_ptr2], #16]\n"
-                "fmla    v27.4s, bb3.4s, a2.s[0]\n"
-                "fmla    v31.4s, bb3.4s, a3.s[0]\n"
-                "ldr    b3q, [%[b_ptr], #48]\n"
+                "fmla	v19.4s, bb3.4s, a0.s[0]\n"
+                "fmla	v23.4s, bb3.4s, a1.s[0]\n"
+                "ldr	a2aq, [%[a_ptr2], #16]\n"
+                "fmla	v27.4s, bb3.4s, a2.s[0]\n"
+                "fmla	v31.4s, bb3.4s, a3.s[0]\n"
+                "ldr	b3q, [%[b_ptr], #48]\n"
 
                 // Unroll 1
-                "fmla    v16.4s, b0a.4s, a0.s[1]\n"
-                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
-                "fmla    v20.4s, b0a.4s, a1.s[1]\n"
-                "ldr    a3aq, [%[a_ptr3], #16]\n"
-                "fmla    v24.4s, b0a.4s, a2.s[1]\n"
-                "fmla    v28.4s, b0a.4s, a3.s[1]\n"
-                "ldr    b0aq, [%[b_ptr]]\n"
+                "fmla	v16.4s, b0a.4s, a0.s[1]\n"
+                "add	%[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla	v20.4s, b0a.4s, a1.s[1]\n"
+                "ldr	a3aq, [%[a_ptr3], #16]\n"
+                "fmla	v24.4s, b0a.4s, a2.s[1]\n"
+                "fmla	v28.4s, b0a.4s, a3.s[1]\n"
+                "ldr	b0aq, [%[b_ptr]]\n"
 
-                "fmla    v17.4s, b1a.4s, a0.s[1]\n"
-                "fmla    v21.4s, b1a.4s, a1.s[1]\n"
-                "subs    %w[loops], %w[loops], #1\n"
-                "fmla    v25.4s, b1a.4s, a2.s[1]\n"
-                "fmla    v29.4s, b1a.4s, a3.s[1]\n"
-                "ldr    b1aq, [%[b_ptr], #16]\n"
+                "fmla	v17.4s, b1a.4s, a0.s[1]\n"
+                "fmla	v21.4s, b1a.4s, a1.s[1]\n"
+                "subs	%w[loops], %w[loops], #1\n"
+                "fmla	v25.4s, b1a.4s, a2.s[1]\n"
+                "fmla	v29.4s, b1a.4s, a3.s[1]\n"
+                "ldr	b1aq, [%[b_ptr], #16]\n"
 
-                "fmla    v18.4s, b2a.4s, a0.s[1]\n"
-                "fmla    v22.4s, b2a.4s, a1.s[1]\n"
-                "fmla    v26.4s, b2a.4s, a2.s[1]\n"
-                "fmla    v30.4s, b2a.4s, a3.s[1]\n"
-                "ldr    b2aq, [%[b_ptr], #32]\n"
+                "fmla	v18.4s, b2a.4s, a0.s[1]\n"
+                "fmla	v22.4s, b2a.4s, a1.s[1]\n"
+                "fmla	v26.4s, b2a.4s, a2.s[1]\n"
+                "fmla	v30.4s, b2a.4s, a3.s[1]\n"
+                "ldr	b2aq, [%[b_ptr], #32]\n"
 
-                "fmla    v19.4s, b3a.4s, a0.s[1]\n"
-                "fmla    v23.4s, b3a.4s, a1.s[1]\n"
-                "fmla    v27.4s, b3a.4s, a2.s[1]\n"
-                "fmla    v31.4s, b3a.4s, a3.s[1]\n"
-                "ldr    b3aq, [%[b_ptr], #48]\n"
+                "fmla	v19.4s, b3a.4s, a0.s[1]\n"
+                "fmla	v23.4s, b3a.4s, a1.s[1]\n"
+                "fmla	v27.4s, b3a.4s, a2.s[1]\n"
+                "fmla	v31.4s, b3a.4s, a3.s[1]\n"
+                "ldr	b3aq, [%[b_ptr], #48]\n"
 
                 // Unroll 2
                 "fmla	v16.4s, bb0.4s, a0.s[2]\n"
@@ -273,173 +272,173 @@
                 "ldr	b3q, [%[b_ptr], #48]\n"
 
                 // Unroll 3
-                "fmla    v16.4s, b0a.4s, a0.s[3]\n"
-                "fmla    v20.4s, b0a.4s, a1.s[3]\n"
-                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
-                "fmla    v24.4s, b0a.4s, a2.s[3]\n"
-                "fmla    v28.4s, b0a.4s, a3.s[3]\n"
-                "ldr    b0aq, [%[b_ptr]]\n"
+                "fmla	v16.4s, b0a.4s, a0.s[3]\n"
+                "fmla	v20.4s, b0a.4s, a1.s[3]\n"
+                "add	%[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla	v24.4s, b0a.4s, a2.s[3]\n"
+                "fmla	v28.4s, b0a.4s, a3.s[3]\n"
+                "ldr	b0aq, [%[b_ptr]]\n"
 
-                "fmla    v17.4s, b1a.4s, a0.s[3]\n"
-                "fmla    v21.4s, b1a.4s, a1.s[3]\n"
-                "fmla    v25.4s, b1a.4s, a2.s[3]\n"
-                "fmla    v29.4s, b1a.4s, a3.s[3]\n"
-                "ldr    b1aq, [%[b_ptr], #16]\n"
+                "fmla	v17.4s, b1a.4s, a0.s[3]\n"
+                "fmla	v21.4s, b1a.4s, a1.s[3]\n"
+                "fmla	v25.4s, b1a.4s, a2.s[3]\n"
+                "fmla	v29.4s, b1a.4s, a3.s[3]\n"
+                "ldr	b1aq, [%[b_ptr], #16]\n"
 
-                "fmla    v18.4s, b2a.4s, a0.s[3]\n"
-                "fmla    v22.4s, b2a.4s, a1.s[3]\n"
-                "fmla    v26.4s, b2a.4s, a2.s[3]\n"
-                "fmla    v30.4s, b2a.4s, a3.s[3]\n"
-                "ldr    b2aq, [%[b_ptr], #32]\n"
+                "fmla	v18.4s, b2a.4s, a0.s[3]\n"
+                "fmla	v22.4s, b2a.4s, a1.s[3]\n"
+                "fmla	v26.4s, b2a.4s, a2.s[3]\n"
+                "fmla	v30.4s, b2a.4s, a3.s[3]\n"
+                "ldr	b2aq, [%[b_ptr], #32]\n"
 
-                "fmla    v19.4s, b3a.4s, a0.s[3]\n"
-                "fmla    v23.4s, b3a.4s, a1.s[3]\n"
-                "ldr    a0q, [%[a_ptr0]]\n"
-                "fmla    v27.4s, b3a.4s, a2.s[3]\n"
-                "fmla    v31.4s, b3a.4s, a3.s[3]\n"
-                "ldr    b3aq, [%[b_ptr], #48]\n"
+                "fmla	v19.4s, b3a.4s, a0.s[3]\n"
+                "fmla	v23.4s, b3a.4s, a1.s[3]\n"
+                "ldr	a0q, [%[a_ptr0]]\n"
+                "fmla	v27.4s, b3a.4s, a2.s[3]\n"
+                "fmla	v31.4s, b3a.4s, a3.s[3]\n"
+                "ldr	b3aq, [%[b_ptr], #48]\n"
 
                 // Unroll 4
-                "fmla    v16.4s, bb0.4s, a0a.s[0]\n"
-                "fmla    v20.4s, bb0.4s, a1a.s[0]\n"
-                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
-                "fmla    v24.4s, bb0.4s, a2a.s[0]\n"
-                "fmla    v28.4s, bb0.4s, a3a.s[0]\n"
-                "ldr    b0q, [%[b_ptr]]\n"
+                "fmla	v16.4s, bb0.4s, a0a.s[0]\n"
+                "fmla	v20.4s, bb0.4s, a1a.s[0]\n"
+                "add	%[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla	v24.4s, bb0.4s, a2a.s[0]\n"
+                "fmla	v28.4s, bb0.4s, a3a.s[0]\n"
+                "ldr	b0q, [%[b_ptr]]\n"
 
-                "fmla    v17.4s, bb1.4s, a0a.s[0]\n"
-                "fmla    v21.4s, bb1.4s, a1a.s[0]\n"
-                "ldr    a1q, [%[a_ptr1]]\n"
-                "fmla    v25.4s, bb1.4s, a2a.s[0]\n"
-                "fmla    v29.4s, bb1.4s, a3a.s[0]\n"
-                "ldr    b1q, [%[b_ptr], #16]\n"
+                "fmla	v17.4s, bb1.4s, a0a.s[0]\n"
+                "fmla	v21.4s, bb1.4s, a1a.s[0]\n"
+                "ldr	a1q, [%[a_ptr1]]\n"
+                "fmla	v25.4s, bb1.4s, a2a.s[0]\n"
+                "fmla	v29.4s, bb1.4s, a3a.s[0]\n"
+                "ldr	b1q, [%[b_ptr], #16]\n"
 
-                "fmla    v18.4s, bb2.4s, a0a.s[0]\n"
-                "fmla    v22.4s, bb2.4s, a1a.s[0]\n"
-                "ldr    a2q, [%[a_ptr2]]\n"
-                "fmla    v26.4s, bb2.4s, a2a.s[0]\n"
-                "fmla    v30.4s, bb2.4s, a3a.s[0]\n"
-                "ldr    b2q, [%[b_ptr], #32]\n"
+                "fmla	v18.4s, bb2.4s, a0a.s[0]\n"
+                "fmla	v22.4s, bb2.4s, a1a.s[0]\n"
+                "ldr	a2q, [%[a_ptr2]]\n"
+                "fmla	v26.4s, bb2.4s, a2a.s[0]\n"
+                "fmla	v30.4s, bb2.4s, a3a.s[0]\n"
+                "ldr	b2q, [%[b_ptr], #32]\n"
 
-                "fmla    v19.4s, bb3.4s, a0a.s[0]\n"
-                "fmla    v23.4s, bb3.4s, a1a.s[0]\n"
-                "ldr    a3q, [%[a_ptr3]]\n"
-                "fmla    v27.4s, bb3.4s, a2a.s[0]\n"
-                "fmla    v31.4s, bb3.4s, a3a.s[0]\n"
-                "ldr    b3q, [%[b_ptr], #48]\n"
+                "fmla	v19.4s, bb3.4s, a0a.s[0]\n"
+                "fmla	v23.4s, bb3.4s, a1a.s[0]\n"
+                "ldr	a3q, [%[a_ptr3]]\n"
+                "fmla	v27.4s, bb3.4s, a2a.s[0]\n"
+                "fmla	v31.4s, bb3.4s, a3a.s[0]\n"
+                "ldr	b3q, [%[b_ptr], #48]\n"
 
                 // Unroll 5
-                "fmla    v16.4s, b0a.4s, a0a.s[1]\n"
-                "fmla    v20.4s, b0a.4s, a1a.s[1]\n"
-                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
-                "fmla    v24.4s, b0a.4s, a2a.s[1]\n"
-                "fmla    v28.4s, b0a.4s, a3a.s[1]\n"
-                "ldr    b0aq, [%[b_ptr]]\n"
+                "fmla	v16.4s, b0a.4s, a0a.s[1]\n"
+                "fmla	v20.4s, b0a.4s, a1a.s[1]\n"
+                "add	%[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla	v24.4s, b0a.4s, a2a.s[1]\n"
+                "fmla	v28.4s, b0a.4s, a3a.s[1]\n"
+                "ldr	b0aq, [%[b_ptr]]\n"
 
-                "fmla    v17.4s, b1a.4s, a0a.s[1]\n"
-                "fmla    v21.4s, b1a.4s, a1a.s[1]\n"
-                "fmla    v25.4s, b1a.4s, a2a.s[1]\n"
-                "fmla    v29.4s, b1a.4s, a3a.s[1]\n"
-                "ldr    b1aq, [%[b_ptr], #16]\n"
+                "fmla	v17.4s, b1a.4s, a0a.s[1]\n"
+                "fmla	v21.4s, b1a.4s, a1a.s[1]\n"
+                "fmla	v25.4s, b1a.4s, a2a.s[1]\n"
+                "fmla	v29.4s, b1a.4s, a3a.s[1]\n"
+                "ldr	b1aq, [%[b_ptr], #16]\n"
 
-                "fmla    v18.4s, b2a.4s, a0a.s[1]\n"
-                "fmla    v22.4s, b2a.4s, a1a.s[1]\n"
-                "fmla    v26.4s, b2a.4s, a2a.s[1]\n"
-                "fmla    v30.4s, b2a.4s, a3a.s[1]\n"
-                "ldr    b2aq, [%[b_ptr], #32]\n"
+                "fmla	v18.4s, b2a.4s, a0a.s[1]\n"
+                "fmla	v22.4s, b2a.4s, a1a.s[1]\n"
+                "fmla	v26.4s, b2a.4s, a2a.s[1]\n"
+                "fmla	v30.4s, b2a.4s, a3a.s[1]\n"
+                "ldr	b2aq, [%[b_ptr], #32]\n"
 
-                "fmla    v19.4s, b3a.4s, a0a.s[1]\n"
-                "fmla    v23.4s, b3a.4s, a1a.s[1]\n"
-                "fmla    v27.4s, b3a.4s, a2a.s[1]\n"
-                "fmla    v31.4s, b3a.4s, a3a.s[1]\n"
-                "ldr    b3aq, [%[b_ptr], #48]\n"
+                "fmla	v19.4s, b3a.4s, a0a.s[1]\n"
+                "fmla	v23.4s, b3a.4s, a1a.s[1]\n"
+                "fmla	v27.4s, b3a.4s, a2a.s[1]\n"
+                "fmla	v31.4s, b3a.4s, a3a.s[1]\n"
+                "ldr	b3aq, [%[b_ptr], #48]\n"
 
                 // Unroll 6
-                "fmla    v16.4s, bb0.4s, a0a.s[2]\n"
-                "fmla    v20.4s, bb0.4s, a1a.s[2]\n"
-                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
-                "fmla    v24.4s, bb0.4s, a2a.s[2]\n"
-                "fmla    v28.4s, bb0.4s, a3a.s[2]\n"
-                "ldr    b0q, [%[b_ptr]]\n"
+                "fmla	v16.4s, bb0.4s, a0a.s[2]\n"
+                "fmla	v20.4s, bb0.4s, a1a.s[2]\n"
+                "add	%[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla	v24.4s, bb0.4s, a2a.s[2]\n"
+                "fmla	v28.4s, bb0.4s, a3a.s[2]\n"
+                "ldr	b0q, [%[b_ptr]]\n"
 
-                "fmla    v17.4s, bb1.4s, a0a.s[2]\n"
-                "fmla    v21.4s, bb1.4s, a1a.s[2]\n"
-                "fmla    v25.4s, bb1.4s, a2a.s[2]\n"
-                "fmla    v29.4s, bb1.4s, a3a.s[2]\n"
-                "ldr    b1q, [%[b_ptr], #16]\n"
+                "fmla	v17.4s, bb1.4s, a0a.s[2]\n"
+                "fmla	v21.4s, bb1.4s, a1a.s[2]\n"
+                "fmla	v25.4s, bb1.4s, a2a.s[2]\n"
+                "fmla	v29.4s, bb1.4s, a3a.s[2]\n"
+                "ldr	b1q, [%[b_ptr], #16]\n"
 
-                "fmla    v18.4s, bb2.4s, a0a.s[2]\n"
-                "fmla    v22.4s, bb2.4s, a1a.s[2]\n"
-                "fmla    v26.4s, bb2.4s, a2a.s[2]\n"
-                "fmla    v30.4s, bb2.4s, a3a.s[2]\n"
-                "ldr    b2q, [%[b_ptr], #32]\n"
+                "fmla	v18.4s, bb2.4s, a0a.s[2]\n"
+                "fmla	v22.4s, bb2.4s, a1a.s[2]\n"
+                "fmla	v26.4s, bb2.4s, a2a.s[2]\n"
+                "fmla	v30.4s, bb2.4s, a3a.s[2]\n"
+                "ldr	b2q, [%[b_ptr], #32]\n"
 
-                "fmla    v19.4s, bb3.4s, a0a.s[2]\n"
-                "fmla    v23.4s, bb3.4s, a1a.s[2]\n"
-                "fmla    v27.4s, bb3.4s, a2a.s[2]\n"
-                "fmla    v31.4s, bb3.4s, a3a.s[2]\n"
-                "ldr    b3q, [%[b_ptr], #48]\n"
+                "fmla	v19.4s, bb3.4s, a0a.s[2]\n"
+                "fmla	v23.4s, bb3.4s, a1a.s[2]\n"
+                "fmla	v27.4s, bb3.4s, a2a.s[2]\n"
+                "fmla	v31.4s, bb3.4s, a3a.s[2]\n"
+                "ldr	b3q, [%[b_ptr], #48]\n"
 
                 // Unroll 7
-                "fmla    v16.4s, b0a.4s, a0a.s[3]\n"
-                "fmla    v20.4s, b0a.4s, a1a.s[3]\n"
-                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
-                "fmla    v24.4s, b0a.4s, a2a.s[3]\n"
-                "fmla    v28.4s, b0a.4s, a3a.s[3]\n"
-                "ldr    b0aq, [%[b_ptr]]\n"
+                "fmla	v16.4s, b0a.4s, a0a.s[3]\n"
+                "fmla	v20.4s, b0a.4s, a1a.s[3]\n"
+                "add	%[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla	v24.4s, b0a.4s, a2a.s[3]\n"
+                "fmla	v28.4s, b0a.4s, a3a.s[3]\n"
+                "ldr	b0aq, [%[b_ptr]]\n"
 
-                "fmla    v17.4s, b1a.4s, a0a.s[3]\n"
-                "fmla    v21.4s, b1a.4s, a1a.s[3]\n"
-                "fmla    v25.4s, b1a.4s, a2a.s[3]\n"
-                "fmla    v29.4s, b1a.4s, a3a.s[3]\n"
-                "ldr    b1aq, [%[b_ptr], #16]\n"
+                "fmla	v17.4s, b1a.4s, a0a.s[3]\n"
+                "fmla	v21.4s, b1a.4s, a1a.s[3]\n"
+                "fmla	v25.4s, b1a.4s, a2a.s[3]\n"
+                "fmla	v29.4s, b1a.4s, a3a.s[3]\n"
+                "ldr	b1aq, [%[b_ptr], #16]\n"
 
-                "fmla    v18.4s, b2a.4s, a0a.s[3]\n"
-                "fmla    v22.4s, b2a.4s, a1a.s[3]\n"
-                "fmla    v26.4s, b2a.4s, a2a.s[3]\n"
-                "fmla    v30.4s, b2a.4s, a3a.s[3]\n"
-                "ldr    b2aq, [%[b_ptr], #32]\n"
+                "fmla	v18.4s, b2a.4s, a0a.s[3]\n"
+                "fmla	v22.4s, b2a.4s, a1a.s[3]\n"
+                "fmla	v26.4s, b2a.4s, a2a.s[3]\n"
+                "fmla	v30.4s, b2a.4s, a3a.s[3]\n"
+                "ldr	b2aq, [%[b_ptr], #32]\n"
 
-                "fmla    v19.4s, b3a.4s, a0a.s[3]\n"
-                "fmla    v23.4s, b3a.4s, a1a.s[3]\n"
-                "fmla    v27.4s, b3a.4s, a2a.s[3]\n"
-                "fmla    v31.4s, b3a.4s, a3a.s[3]\n"
-                "bne    1b\n"
+                "fmla	v19.4s, b3a.4s, a0a.s[3]\n"
+                "fmla	v23.4s, b3a.4s, a1a.s[3]\n"
+                "fmla	v27.4s, b3a.4s, a2a.s[3]\n"
+                "fmla	v31.4s, b3a.4s, a3a.s[3]\n"
+                "bne	1b\n"
 
                 // Skip to here
                 "4:\n"
 
                 // Detached final iteration
                 // Unroll 0
-                "fmla    v16.4s, bb0.4s, a0.s[0]\n"
-                "fmla    v20.4s, bb0.4s, a1.s[0]\n"
-                "ldr    b3aq, [%[b_ptr], #48]\n"
-                "fmla    v24.4s, bb0.4s, a2.s[0]\n"
-                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
-                "fmla    v28.4s, bb0.4s, a3.s[0]\n"
-                "ldr    b0q, [%[b_ptr]]\n"
+                "fmla	v16.4s, bb0.4s, a0.s[0]\n"
+                "fmla	v20.4s, bb0.4s, a1.s[0]\n"
+                "ldr	b3aq, [%[b_ptr], #48]\n"
+                "fmla	v24.4s, bb0.4s, a2.s[0]\n"
+                "add	%[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla	v28.4s, bb0.4s, a3.s[0]\n"
+                "ldr	b0q, [%[b_ptr]]\n"
 
-                "fmla    v17.4s, bb1.4s, a0.s[0]\n"
-                "cbnz    %w[oddk], 2f\n" // Deal with odd K before we load a0a
-                "fmla    v21.4s, bb1.4s, a1.s[0]\n"
-                "ldr    a0aq, [%[a_ptr0], #16]\n"
-                "fmla    v25.4s, bb1.4s, a2.s[0]\n"
-                "fmla    v29.4s, bb1.4s, a3.s[0]\n"
-                "ldr    b1q, [%[b_ptr], #16]\n"
+                "fmla	v17.4s, bb1.4s, a0.s[0]\n"
+                "cbnz	%w[oddk], 2f\n" // Deal with odd K before we load a0a
+                "fmla	v21.4s, bb1.4s, a1.s[0]\n"
+                "ldr	a0aq, [%[a_ptr0], #16]\n"
+                "fmla	v25.4s, bb1.4s, a2.s[0]\n"
+                "fmla	v29.4s, bb1.4s, a3.s[0]\n"
+                "ldr	b1q, [%[b_ptr], #16]\n"
 
-                "fmla    v18.4s, bb2.4s, a0.s[0]\n"
-                "fmla    v22.4s, bb2.4s, a1.s[0]\n"
-                "ldr    a1aq, [%[a_ptr1], #16]\n"
-                "fmla    v26.4s, bb2.4s, a2.s[0]\n"
-                "fmla    v30.4s, bb2.4s, a3.s[0]\n"
-                "ldr    b2q, [%[b_ptr], #32]\n"
+                "fmla	v18.4s, bb2.4s, a0.s[0]\n"
+                "fmla	v22.4s, bb2.4s, a1.s[0]\n"
+                "ldr	a1aq, [%[a_ptr1], #16]\n"
+                "fmla	v26.4s, bb2.4s, a2.s[0]\n"
+                "fmla	v30.4s, bb2.4s, a3.s[0]\n"
+                "ldr	b2q, [%[b_ptr], #32]\n"
 
-                "fmla    v19.4s, bb3.4s, a0.s[0]\n"
-                "fmla    v23.4s, bb3.4s, a1.s[0]\n"
-                "ldr    a2aq, [%[a_ptr2], #16]\n"
-                "fmla    v27.4s, bb3.4s, a2.s[0]\n"
-                "fmla    v31.4s, bb3.4s, a3.s[0]\n"
-                "ldr    b3q, [%[b_ptr], #48]\n"
+                "fmla	v19.4s, bb3.4s, a0.s[0]\n"
+                "fmla	v23.4s, bb3.4s, a1.s[0]\n"
+                "ldr	a2aq, [%[a_ptr2], #16]\n"
+                "fmla	v27.4s, bb3.4s, a2.s[0]\n"
+                "fmla	v31.4s, bb3.4s, a3.s[0]\n"
+                "ldr	b3q, [%[b_ptr], #48]\n"
 
                 // Unroll 1
                 "fmla	v16.4s, b0a.4s, a0.s[1]\n"
@@ -473,394 +472,394 @@
                 "ldr	b3aq, [%[b_ptr], #48]\n"
 
                 // Unroll 2
-                "fmla    v16.4s, bb0.4s, a0.s[2]\n"
-                "fmla    v20.4s, bb0.4s, a1.s[2]\n"
-                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
-                "fmla    v24.4s, bb0.4s, a2.s[2]\n"
-                "fmla    v28.4s, bb0.4s, a3.s[2]\n"
-                "ldr    b0q, [%[b_ptr]]\n"
+                "fmla	v16.4s, bb0.4s, a0.s[2]\n"
+                "fmla	v20.4s, bb0.4s, a1.s[2]\n"
+                "add	%[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla	v24.4s, bb0.4s, a2.s[2]\n"
+                "fmla	v28.4s, bb0.4s, a3.s[2]\n"
+                "ldr	b0q, [%[b_ptr]]\n"
 
-                "fmla    v17.4s, bb1.4s, a0.s[2]\n"
-                "fmla    v21.4s, bb1.4s, a1.s[2]\n"
-                "fmla    v25.4s, bb1.4s, a2.s[2]\n"
-                "fmla    v29.4s, bb1.4s, a3.s[2]\n"
-                "ldr    b1q, [%[b_ptr], #16]\n"
+                "fmla	v17.4s, bb1.4s, a0.s[2]\n"
+                "fmla	v21.4s, bb1.4s, a1.s[2]\n"
+                "fmla	v25.4s, bb1.4s, a2.s[2]\n"
+                "fmla	v29.4s, bb1.4s, a3.s[2]\n"
+                "ldr	b1q, [%[b_ptr], #16]\n"
 
-                "fmla    v18.4s, bb2.4s, a0.s[2]\n"
-                "fmla    v22.4s, bb2.4s, a1.s[2]\n"
-                "fmla    v26.4s, bb2.4s, a2.s[2]\n"
-                "fmla    v30.4s, bb2.4s, a3.s[2]\n"
-                "ldr    b2q, [%[b_ptr], #32]\n"
+                "fmla	v18.4s, bb2.4s, a0.s[2]\n"
+                "fmla	v22.4s, bb2.4s, a1.s[2]\n"
+                "fmla	v26.4s, bb2.4s, a2.s[2]\n"
+                "fmla	v30.4s, bb2.4s, a3.s[2]\n"
+                "ldr	b2q, [%[b_ptr], #32]\n"
 
-                "fmla    v19.4s, bb3.4s, a0.s[2]\n"
-                "fmla    v23.4s, bb3.4s, a1.s[2]\n"
-                "fmla    v27.4s, bb3.4s, a2.s[2]\n"
-                "fmla    v31.4s, bb3.4s, a3.s[2]\n"
-                "ldr    b3q, [%[b_ptr], #48]\n"
+                "fmla	v19.4s, bb3.4s, a0.s[2]\n"
+                "fmla	v23.4s, bb3.4s, a1.s[2]\n"
+                "fmla	v27.4s, bb3.4s, a2.s[2]\n"
+                "fmla	v31.4s, bb3.4s, a3.s[2]\n"
+                "ldr	b3q, [%[b_ptr], #48]\n"
 
                 // Unroll 3
-                "fmla    v16.4s, b0a.4s, a0.s[3]\n"
-                "fmla    v20.4s, b0a.4s, a1.s[3]\n"
-                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
-                "fmla    v24.4s, b0a.4s, a2.s[3]\n"
-                "fmla    v28.4s, b0a.4s, a3.s[3]\n"
-                "ldr    b0aq, [%[b_ptr]]\n"
+                "fmla	v16.4s, b0a.4s, a0.s[3]\n"
+                "fmla	v20.4s, b0a.4s, a1.s[3]\n"
+                "add	%[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla	v24.4s, b0a.4s, a2.s[3]\n"
+                "fmla	v28.4s, b0a.4s, a3.s[3]\n"
+                "ldr	b0aq, [%[b_ptr]]\n"
 
-                "fmla    v17.4s, b1a.4s, a0.s[3]\n"
-                "fmla    v21.4s, b1a.4s, a1.s[3]\n"
-                "fmla    v25.4s, b1a.4s, a2.s[3]\n"
-                "fmla    v29.4s, b1a.4s, a3.s[3]\n"
-                "ldr    b1aq, [%[b_ptr], #16]\n"
+                "fmla	v17.4s, b1a.4s, a0.s[3]\n"
+                "fmla	v21.4s, b1a.4s, a1.s[3]\n"
+                "fmla	v25.4s, b1a.4s, a2.s[3]\n"
+                "fmla	v29.4s, b1a.4s, a3.s[3]\n"
+                "ldr	b1aq, [%[b_ptr], #16]\n"
 
-                "fmla    v18.4s, b2a.4s, a0.s[3]\n"
-                "fmla    v22.4s, b2a.4s, a1.s[3]\n"
-                "fmla    v26.4s, b2a.4s, a2.s[3]\n"
-                "fmla    v30.4s, b2a.4s, a3.s[3]\n"
-                "ldr    b2aq, [%[b_ptr], #32]\n"
+                "fmla	v18.4s, b2a.4s, a0.s[3]\n"
+                "fmla	v22.4s, b2a.4s, a1.s[3]\n"
+                "fmla	v26.4s, b2a.4s, a2.s[3]\n"
+                "fmla	v30.4s, b2a.4s, a3.s[3]\n"
+                "ldr	b2aq, [%[b_ptr], #32]\n"
 
-                "fmla    v19.4s, b3a.4s, a0.s[3]\n"
-                "fmla    v23.4s, b3a.4s, a1.s[3]\n"
-                "fmla    v27.4s, b3a.4s, a2.s[3]\n"
-                "fmla    v31.4s, b3a.4s, a3.s[3]\n"
-                "ldr    b3aq, [%[b_ptr], #48]\n"
+                "fmla	v19.4s, b3a.4s, a0.s[3]\n"
+                "fmla	v23.4s, b3a.4s, a1.s[3]\n"
+                "fmla	v27.4s, b3a.4s, a2.s[3]\n"
+                "fmla	v31.4s, b3a.4s, a3.s[3]\n"
+                "ldr	b3aq, [%[b_ptr], #48]\n"
 
                 // Unroll 4
-                "fmla    v16.4s, bb0.4s, a0a.s[0]\n"
-                "fmla    v20.4s, bb0.4s, a1a.s[0]\n"
-                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
-                "fmla    v24.4s, bb0.4s, a2a.s[0]\n"
-                "fmla    v28.4s, bb0.4s, a3a.s[0]\n"
-                "ldr    b0q, [%[b_ptr]]\n"
+                "fmla	v16.4s, bb0.4s, a0a.s[0]\n"
+                "fmla	v20.4s, bb0.4s, a1a.s[0]\n"
+                "add	%[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla	v24.4s, bb0.4s, a2a.s[0]\n"
+                "fmla	v28.4s, bb0.4s, a3a.s[0]\n"
+                "ldr	b0q, [%[b_ptr]]\n"
 
-                "fmla    v17.4s, bb1.4s, a0a.s[0]\n"
-                "fmla    v21.4s, bb1.4s, a1a.s[0]\n"
-                "fmla    v25.4s, bb1.4s, a2a.s[0]\n"
-                "fmla    v29.4s, bb1.4s, a3a.s[0]\n"
-                "ldr    b1q, [%[b_ptr], #16]\n"
+                "fmla	v17.4s, bb1.4s, a0a.s[0]\n"
+                "fmla	v21.4s, bb1.4s, a1a.s[0]\n"
+                "fmla	v25.4s, bb1.4s, a2a.s[0]\n"
+                "fmla	v29.4s, bb1.4s, a3a.s[0]\n"
+                "ldr	b1q, [%[b_ptr], #16]\n"
 
-                "fmla    v18.4s, bb2.4s, a0a.s[0]\n"
-                "fmla    v22.4s, bb2.4s, a1a.s[0]\n"
-                "fmla    v26.4s, bb2.4s, a2a.s[0]\n"
-                "fmla    v30.4s, bb2.4s, a3a.s[0]\n"
-                "ldr    b2q, [%[b_ptr], #32]\n"
+                "fmla	v18.4s, bb2.4s, a0a.s[0]\n"
+                "fmla	v22.4s, bb2.4s, a1a.s[0]\n"
+                "fmla	v26.4s, bb2.4s, a2a.s[0]\n"
+                "fmla	v30.4s, bb2.4s, a3a.s[0]\n"
+                "ldr	b2q, [%[b_ptr], #32]\n"
 
-                "fmla    v19.4s, bb3.4s, a0a.s[0]\n"
-                "fmla    v23.4s, bb3.4s, a1a.s[0]\n"
-                "fmla    v27.4s, bb3.4s, a2a.s[0]\n"
-                "fmla    v31.4s, bb3.4s, a3a.s[0]\n"
-                "ldr    b3q, [%[b_ptr], #48]\n"
+                "fmla	v19.4s, bb3.4s, a0a.s[0]\n"
+                "fmla	v23.4s, bb3.4s, a1a.s[0]\n"
+                "fmla	v27.4s, bb3.4s, a2a.s[0]\n"
+                "fmla	v31.4s, bb3.4s, a3a.s[0]\n"
+                "ldr	b3q, [%[b_ptr], #48]\n"
 
                 // Unroll 5
-                "fmla    v16.4s, b0a.4s, a0a.s[1]\n"
-                "fmla    v20.4s, b0a.4s, a1a.s[1]\n"
-                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
-                "fmla    v24.4s, b0a.4s, a2a.s[1]\n"
-                "fmla    v28.4s, b0a.4s, a3a.s[1]\n"
-                "ldr    b0aq, [%[b_ptr]]\n"
+                "fmla	v16.4s, b0a.4s, a0a.s[1]\n"
+                "fmla	v20.4s, b0a.4s, a1a.s[1]\n"
+                "add	%[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla	v24.4s, b0a.4s, a2a.s[1]\n"
+                "fmla	v28.4s, b0a.4s, a3a.s[1]\n"
+                "ldr	b0aq, [%[b_ptr]]\n"
 
-                "fmla    v17.4s, b1a.4s, a0a.s[1]\n"
-                "fmla    v21.4s, b1a.4s, a1a.s[1]\n"
-                "fmla    v25.4s, b1a.4s, a2a.s[1]\n"
-                "fmla    v29.4s, b1a.4s, a3a.s[1]\n"
-                "ldr    b1aq, [%[b_ptr], #16]\n"
+                "fmla	v17.4s, b1a.4s, a0a.s[1]\n"
+                "fmla	v21.4s, b1a.4s, a1a.s[1]\n"
+                "fmla	v25.4s, b1a.4s, a2a.s[1]\n"
+                "fmla	v29.4s, b1a.4s, a3a.s[1]\n"
+                "ldr	b1aq, [%[b_ptr], #16]\n"
 
-                "fmla    v18.4s, b2a.4s, a0a.s[1]\n"
-                "fmla    v22.4s, b2a.4s, a1a.s[1]\n"
-                "fmla    v26.4s, b2a.4s, a2a.s[1]\n"
-                "fmla    v30.4s, b2a.4s, a3a.s[1]\n"
-                "ldr    b2aq, [%[b_ptr], #32]\n"
+                "fmla	v18.4s, b2a.4s, a0a.s[1]\n"
+                "fmla	v22.4s, b2a.4s, a1a.s[1]\n"
+                "fmla	v26.4s, b2a.4s, a2a.s[1]\n"
+                "fmla	v30.4s, b2a.4s, a3a.s[1]\n"
+                "ldr	b2aq, [%[b_ptr], #32]\n"
 
-                "fmla    v19.4s, b3a.4s, a0a.s[1]\n"
-                "fmla    v23.4s, b3a.4s, a1a.s[1]\n"
-                "fmla    v27.4s, b3a.4s, a2a.s[1]\n"
-                "fmla    v31.4s, b3a.4s, a3a.s[1]\n"
-                "ldr    b3aq, [%[b_ptr], #48]\n"
+                "fmla	v19.4s, b3a.4s, a0a.s[1]\n"
+                "fmla	v23.4s, b3a.4s, a1a.s[1]\n"
+                "fmla	v27.4s, b3a.4s, a2a.s[1]\n"
+                "fmla	v31.4s, b3a.4s, a3a.s[1]\n"
+                "ldr	b3aq, [%[b_ptr], #48]\n"
 
                 // Unroll 6
-                "fmla    v16.4s, bb0.4s, a0a.s[2]\n"
-                "fmla    v20.4s, bb0.4s, a1a.s[2]\n"
-                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
-                "fmla    v24.4s, bb0.4s, a2a.s[2]\n"
-                "fmla    v28.4s, bb0.4s, a3a.s[2]\n"
+                "fmla	v16.4s, bb0.4s, a0a.s[2]\n"
+                "fmla	v20.4s, bb0.4s, a1a.s[2]\n"
+                "add	%[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla	v24.4s, bb0.4s, a2a.s[2]\n"
+                "fmla	v28.4s, bb0.4s, a3a.s[2]\n"
 
-                "fmla    v17.4s, bb1.4s, a0a.s[2]\n"
-                "fmla    v21.4s, bb1.4s, a1a.s[2]\n"
-                "fmla    v25.4s, bb1.4s, a2a.s[2]\n"
-                "fmla    v29.4s, bb1.4s, a3a.s[2]\n"
+                "fmla	v17.4s, bb1.4s, a0a.s[2]\n"
+                "fmla	v21.4s, bb1.4s, a1a.s[2]\n"
+                "fmla	v25.4s, bb1.4s, a2a.s[2]\n"
+                "fmla	v29.4s, bb1.4s, a3a.s[2]\n"
 
-                "fmla    v18.4s, bb2.4s, a0a.s[2]\n"
-                "fmla    v22.4s, bb2.4s, a1a.s[2]\n"
-                "fmla    v26.4s, bb2.4s, a2a.s[2]\n"
-                "fmla    v30.4s, bb2.4s, a3a.s[2]\n"
+                "fmla	v18.4s, bb2.4s, a0a.s[2]\n"
+                "fmla	v22.4s, bb2.4s, a1a.s[2]\n"
+                "fmla	v26.4s, bb2.4s, a2a.s[2]\n"
+                "fmla	v30.4s, bb2.4s, a3a.s[2]\n"
 
-                "fmla    v19.4s, bb3.4s, a0a.s[2]\n"
-                "fmla    v23.4s, bb3.4s, a1a.s[2]\n"
-                "fmla    v27.4s, bb3.4s, a2a.s[2]\n"
-                "fmla    v31.4s, bb3.4s, a3a.s[2]\n"
+                "fmla	v19.4s, bb3.4s, a0a.s[2]\n"
+                "fmla	v23.4s, bb3.4s, a1a.s[2]\n"
+                "fmla	v27.4s, bb3.4s, a2a.s[2]\n"
+                "fmla	v31.4s, bb3.4s, a3a.s[2]\n"
 
                 // Unroll 7
-                "fmla    v16.4s, b0a.4s, a0a.s[3]\n"
-                "fmla    v17.4s, b1a.4s, a0a.s[3]\n"
-                "fmla    v18.4s, b2a.4s, a0a.s[3]\n"
-                "fmla    v19.4s, b3a.4s, a0a.s[3]\n"
-                "cbnz    %w[odds], 6f\n"
+                "fmla	v16.4s, b0a.4s, a0a.s[3]\n"
+                "fmla	v17.4s, b1a.4s, a0a.s[3]\n"
+                "fmla	v18.4s, b2a.4s, a0a.s[3]\n"
+                "fmla	v19.4s, b3a.4s, a0a.s[3]\n"
+                "cbnz	%w[odds], 6f\n"
 
-                "fmla    v20.4s, b0a.4s, a1a.s[3]\n"
-                "str    q16, [%[c_ptr0]]\n"
-                "fmla    v21.4s, b1a.4s, a1a.s[3]\n"
-                "str    q17, [%[c_ptr0], #16]\n"
-                "fmla    v22.4s, b2a.4s, a1a.s[3]\n"
-                "str    q18, [%[c_ptr0], #32]\n"
-                "fmla    v23.4s, b3a.4s, a1a.s[3]\n"
-                "str    q19, [%[c_ptr0], #48]\n"
+                "fmla	v20.4s, b0a.4s, a1a.s[3]\n"
+                "str	q16, [%[c_ptr0]]\n"
+                "fmla	v21.4s, b1a.4s, a1a.s[3]\n"
+                "str	q17, [%[c_ptr0], #16]\n"
+                "fmla	v22.4s, b2a.4s, a1a.s[3]\n"
+                "str	q18, [%[c_ptr0], #32]\n"
+                "fmla	v23.4s, b3a.4s, a1a.s[3]\n"
+                "str	q19, [%[c_ptr0], #48]\n"
 
-                "fmla    v24.4s, b0a.4s, a2a.s[3]\n"
-                "str    q20, [%[c_ptr1]]\n"
-                "fmla    v25.4s, b1a.4s, a2a.s[3]\n"
-                "str    q21, [%[c_ptr1], #16]\n"
-                "fmla    v26.4s, b2a.4s, a2a.s[3]\n"
-                "str    q22, [%[c_ptr1], #32]\n"
-                "fmla    v27.4s, b3a.4s, a2a.s[3]\n"
-                "str    q23, [%[c_ptr1], #48]\n"
+                "fmla	v24.4s, b0a.4s, a2a.s[3]\n"
+                "str	q20, [%[c_ptr1]]\n"
+                "fmla	v25.4s, b1a.4s, a2a.s[3]\n"
+                "str	q21, [%[c_ptr1], #16]\n"
+                "fmla	v26.4s, b2a.4s, a2a.s[3]\n"
+                "str	q22, [%[c_ptr1], #32]\n"
+                "fmla	v27.4s, b3a.4s, a2a.s[3]\n"
+                "str	q23, [%[c_ptr1], #48]\n"
 
-                "fmla    v28.4s, b0a.4s, a3a.s[3]\n"
-                "str    q24, [%[c_ptr2]]\n"
-                "fmla    v29.4s, b1a.4s, a3a.s[3]\n"
-                "str    q25, [%[c_ptr2], #16]\n"
-                "fmla    v30.4s, b2a.4s, a3a.s[3]\n"
-                "str    q26, [%[c_ptr2], #32]\n"
-                "fmla    v31.4s, b3a.4s, a3a.s[3]\n"
-                "str    q27, [%[c_ptr2], #48]\n"
-                "b    3f\n"
+                "fmla	v28.4s, b0a.4s, a3a.s[3]\n"
+                "str	q24, [%[c_ptr2]]\n"
+                "fmla	v29.4s, b1a.4s, a3a.s[3]\n"
+                "str	q25, [%[c_ptr2], #16]\n"
+                "fmla	v30.4s, b2a.4s, a3a.s[3]\n"
+                "str	q26, [%[c_ptr2], #32]\n"
+                "fmla	v31.4s, b3a.4s, a3a.s[3]\n"
+                "str	q27, [%[c_ptr2], #48]\n"
+                "b	3f\n"
 
                 // Odd K case: Just do 4 more.
                 "2:\n"
-                "fmla    v21.4s, bb1.4s, a1.s[0]\n"
-                "add    %[a_ptr0], %[a_ptr0], #16\n"
-                "fmla    v25.4s, bb1.4s, a2.s[0]\n"
-                "add    %[a_ptr1], %[a_ptr1], #16\n"
-                "fmla    v29.4s, bb1.4s, a3.s[0]\n"
-                "ldr    b1q, [%[b_ptr], #16]\n"
+                "fmla	v21.4s, bb1.4s, a1.s[0]\n"
+                "add	%[a_ptr0], %[a_ptr0], #16\n"
+                "fmla	v25.4s, bb1.4s, a2.s[0]\n"
+                "add	%[a_ptr1], %[a_ptr1], #16\n"
+                "fmla	v29.4s, bb1.4s, a3.s[0]\n"
+                "ldr	b1q, [%[b_ptr], #16]\n"
 
-                "fmla    v18.4s, bb2.4s, a0.s[0]\n"
-                "add    %[a_ptr2], %[a_ptr2], #16\n"
-                "fmla    v22.4s, bb2.4s, a1.s[0]\n"
-                "add    %[a_ptr3], %[a_ptr3], #16\n"
-                "fmla    v26.4s, bb2.4s, a2.s[0]\n"
-                "fmla    v30.4s, bb2.4s, a3.s[0]\n"
-                "ldr    b2q, [%[b_ptr], #32]\n"
+                "fmla	v18.4s, bb2.4s, a0.s[0]\n"
+                "add	%[a_ptr2], %[a_ptr2], #16\n"
+                "fmla	v22.4s, bb2.4s, a1.s[0]\n"
+                "add	%[a_ptr3], %[a_ptr3], #16\n"
+                "fmla	v26.4s, bb2.4s, a2.s[0]\n"
+                "fmla	v30.4s, bb2.4s, a3.s[0]\n"
+                "ldr	b2q, [%[b_ptr], #32]\n"
 
-                "fmla    v19.4s, bb3.4s, a0.s[0]\n"
-                "fmla    v23.4s, bb3.4s, a1.s[0]\n"
-                "fmla    v27.4s, bb3.4s, a2.s[0]\n"
-                "fmla    v31.4s, bb3.4s, a3.s[0]\n"
-                "ldr    b3q, [%[b_ptr], #48]\n"
+                "fmla	v19.4s, bb3.4s, a0.s[0]\n"
+                "fmla	v23.4s, bb3.4s, a1.s[0]\n"
+                "fmla	v27.4s, bb3.4s, a2.s[0]\n"
+                "fmla	v31.4s, bb3.4s, a3.s[0]\n"
+                "ldr	b3q, [%[b_ptr], #48]\n"
 
                 // Unroll 1
-                "fmla    v16.4s, b0a.4s, a0.s[1]\n"
-                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
-                "fmla    v20.4s, b0a.4s, a1.s[1]\n"
-                "fmla    v24.4s, b0a.4s, a2.s[1]\n"
-                "fmla    v28.4s, b0a.4s, a3.s[1]\n"
-                "ldr    b0aq, [%[b_ptr]]\n"
+                "fmla	v16.4s, b0a.4s, a0.s[1]\n"
+                "add	%[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla	v20.4s, b0a.4s, a1.s[1]\n"
+                "fmla	v24.4s, b0a.4s, a2.s[1]\n"
+                "fmla	v28.4s, b0a.4s, a3.s[1]\n"
+                "ldr	b0aq, [%[b_ptr]]\n"
 
-                "fmla    v17.4s, b1a.4s, a0.s[1]\n"
-                "fmla    v21.4s, b1a.4s, a1.s[1]\n"
-                "fmla    v25.4s, b1a.4s, a2.s[1]\n"
-                "fmla    v29.4s, b1a.4s, a3.s[1]\n"
-                "ldr    b1aq, [%[b_ptr], #16]\n"
+                "fmla	v17.4s, b1a.4s, a0.s[1]\n"
+                "fmla	v21.4s, b1a.4s, a1.s[1]\n"
+                "fmla	v25.4s, b1a.4s, a2.s[1]\n"
+                "fmla	v29.4s, b1a.4s, a3.s[1]\n"
+                "ldr	b1aq, [%[b_ptr], #16]\n"
 
-                "fmla    v18.4s, b2a.4s, a0.s[1]\n"
-                "fmla    v22.4s, b2a.4s, a1.s[1]\n"
-                "fmla    v26.4s, b2a.4s, a2.s[1]\n"
-                "fmla    v30.4s, b2a.4s, a3.s[1]\n"
-                "ldr    b2aq, [%[b_ptr], #32]\n"
+                "fmla	v18.4s, b2a.4s, a0.s[1]\n"
+                "fmla	v22.4s, b2a.4s, a1.s[1]\n"
+                "fmla	v26.4s, b2a.4s, a2.s[1]\n"
+                "fmla	v30.4s, b2a.4s, a3.s[1]\n"
+                "ldr	b2aq, [%[b_ptr], #32]\n"
 
-                "fmla    v19.4s, b3a.4s, a0.s[1]\n"
-                "fmla    v23.4s, b3a.4s, a1.s[1]\n"
-                "fmla    v27.4s, b3a.4s, a2.s[1]\n"
-                "fmla    v31.4s, b3a.4s, a3.s[1]\n"
-                "ldr    b3aq, [%[b_ptr], #48]\n"
+                "fmla	v19.4s, b3a.4s, a0.s[1]\n"
+                "fmla	v23.4s, b3a.4s, a1.s[1]\n"
+                "fmla	v27.4s, b3a.4s, a2.s[1]\n"
+                "fmla	v31.4s, b3a.4s, a3.s[1]\n"
+                "ldr	b3aq, [%[b_ptr], #48]\n"
 
                 // Unroll 2
-                "fmla    v16.4s, bb0.4s, a0.s[2]\n"
-                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
-                "fmla    v20.4s, bb0.4s, a1.s[2]\n"
-                "fmla    v24.4s, bb0.4s, a2.s[2]\n"
-                "fmla    v28.4s, bb0.4s, a3.s[2]\n"
+                "fmla	v16.4s, bb0.4s, a0.s[2]\n"
+                "add	%[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla	v20.4s, bb0.4s, a1.s[2]\n"
+                "fmla	v24.4s, bb0.4s, a2.s[2]\n"
+                "fmla	v28.4s, bb0.4s, a3.s[2]\n"
 
-                "fmla    v17.4s, bb1.4s, a0.s[2]\n"
-                "fmla    v21.4s, bb1.4s, a1.s[2]\n"
-                "fmla    v25.4s, bb1.4s, a2.s[2]\n"
-                "fmla    v29.4s, bb1.4s, a3.s[2]\n"
+                "fmla	v17.4s, bb1.4s, a0.s[2]\n"
+                "fmla	v21.4s, bb1.4s, a1.s[2]\n"
+                "fmla	v25.4s, bb1.4s, a2.s[2]\n"
+                "fmla	v29.4s, bb1.4s, a3.s[2]\n"
 
-                "fmla    v18.4s, bb2.4s, a0.s[2]\n"
-                "fmla    v22.4s, bb2.4s, a1.s[2]\n"
-                "fmla    v26.4s, bb2.4s, a2.s[2]\n"
-                "fmla    v30.4s, bb2.4s, a3.s[2]\n"
+                "fmla	v18.4s, bb2.4s, a0.s[2]\n"
+                "fmla	v22.4s, bb2.4s, a1.s[2]\n"
+                "fmla	v26.4s, bb2.4s, a2.s[2]\n"
+                "fmla	v30.4s, bb2.4s, a3.s[2]\n"
 
-                "fmla    v19.4s, bb3.4s, a0.s[2]\n"
-                "fmla    v23.4s, bb3.4s, a1.s[2]\n"
-                "fmla    v27.4s, bb3.4s, a2.s[2]\n"
-                "fmla    v31.4s, bb3.4s, a3.s[2]\n"
+                "fmla	v19.4s, bb3.4s, a0.s[2]\n"
+                "fmla	v23.4s, bb3.4s, a1.s[2]\n"
+                "fmla	v27.4s, bb3.4s, a2.s[2]\n"
+                "fmla	v31.4s, bb3.4s, a3.s[2]\n"
 
                 // Unroll 3
-                "fmla    v16.4s, b0a.4s, a0.s[3]\n"
-                "fmla    v17.4s, b1a.4s, a0.s[3]\n"
-                "fmla    v18.4s, b2a.4s, a0.s[3]\n"
-                "fmla    v19.4s, b3a.4s, a0.s[3]\n"
-                "cbnz    %w[odds], 7f\n"
+                "fmla	v16.4s, b0a.4s, a0.s[3]\n"
+                "fmla	v17.4s, b1a.4s, a0.s[3]\n"
+                "fmla	v18.4s, b2a.4s, a0.s[3]\n"
+                "fmla	v19.4s, b3a.4s, a0.s[3]\n"
+                "cbnz	%w[odds], 7f\n"
 
-                "fmla    v20.4s, b0a.4s, a1.s[3]\n"
-                "str    q16, [%[c_ptr0]]\n"
-                "fmla    v21.4s, b1a.4s, a1.s[3]\n"
-                "str    q17, [%[c_ptr0], #16]\n"
-                "fmla    v22.4s, b2a.4s, a1.s[3]\n"
-                "str    q18, [%[c_ptr0], #32]\n"
-                "fmla    v23.4s, b3a.4s, a1.s[3]\n"
-                "str    q19, [%[c_ptr0], #48]\n"
+                "fmla	v20.4s, b0a.4s, a1.s[3]\n"
+                "str	q16, [%[c_ptr0]]\n"
+                "fmla	v21.4s, b1a.4s, a1.s[3]\n"
+                "str	q17, [%[c_ptr0], #16]\n"
+                "fmla	v22.4s, b2a.4s, a1.s[3]\n"
+                "str	q18, [%[c_ptr0], #32]\n"
+                "fmla	v23.4s, b3a.4s, a1.s[3]\n"
+                "str	q19, [%[c_ptr0], #48]\n"
 
-                "fmla    v24.4s, b0a.4s, a2.s[3]\n"
-                "str    q20, [%[c_ptr1]]\n"
-                "fmla    v25.4s, b1a.4s, a2.s[3]\n"
-                "str    q21, [%[c_ptr1], #16]\n"
-                "fmla    v26.4s, b2a.4s, a2.s[3]\n"
-                "str    q22, [%[c_ptr1], #32]\n"
-                "fmla    v27.4s, b3a.4s, a2.s[3]\n"
-                "str    q23, [%[c_ptr1], #48]\n"
+                "fmla	v24.4s, b0a.4s, a2.s[3]\n"
+                "str	q20, [%[c_ptr1]]\n"
+                "fmla	v25.4s, b1a.4s, a2.s[3]\n"
+                "str	q21, [%[c_ptr1], #16]\n"
+                "fmla	v26.4s, b2a.4s, a2.s[3]\n"
+                "str	q22, [%[c_ptr1], #32]\n"
+                "fmla	v27.4s, b3a.4s, a2.s[3]\n"
+                "str	q23, [%[c_ptr1], #48]\n"
 
-                "fmla    v28.4s, b0a.4s, a3.s[3]\n"
-                "str    q24, [%[c_ptr2]]\n"
-                "fmla    v29.4s, b1a.4s, a3.s[3]\n"
-                "str    q25, [%[c_ptr2], #16]\n"
-                "fmla    v30.4s, b2a.4s, a3.s[3]\n"
-                "str    q26, [%[c_ptr2], #32]\n"
-                "fmla    v31.4s, b3a.4s, a3.s[3]\n"
-                "str    q27, [%[c_ptr2], #48]\n"
-                "b    3f\n"
+                "fmla	v28.4s, b0a.4s, a3.s[3]\n"
+                "str	q24, [%[c_ptr2]]\n"
+                "fmla	v29.4s, b1a.4s, a3.s[3]\n"
+                "str	q25, [%[c_ptr2], #16]\n"
+                "fmla	v30.4s, b2a.4s, a3.s[3]\n"
+                "str	q26, [%[c_ptr2], #32]\n"
+                "fmla	v31.4s, b3a.4s, a3.s[3]\n"
+                "str	q27, [%[c_ptr2], #48]\n"
+                "b	3f\n"
 
                 // "Odd ones" - lead in from even
                 "6:\n"
-                "fmla    v20.4s, b0a.4s, a1a.s[3]\n"
-                "fmla    v21.4s, b1a.4s, a1a.s[3]\n"
-                "ldr    b0q, [%[b_ptr]]\n"
-                "fmla    v22.4s, b2a.4s, a1a.s[3]\n"
-                "subs    %w[odds], %w[odds], #1\n"
-                "fmla    v23.4s, b3a.4s, a1a.s[3]\n"
-                "ldr    b1q, [%[b_ptr], #16]\n"
+                "fmla	v20.4s, b0a.4s, a1a.s[3]\n"
+                "fmla	v21.4s, b1a.4s, a1a.s[3]\n"
+                "ldr	b0q, [%[b_ptr]]\n"
+                "fmla	v22.4s, b2a.4s, a1a.s[3]\n"
+                "subs	%w[odds], %w[odds], #1\n"
+                "fmla	v23.4s, b3a.4s, a1a.s[3]\n"
+                "ldr	b1q, [%[b_ptr], #16]\n"
 
-                "fmla    v24.4s, b0a.4s, a2a.s[3]\n"
-                "fmla    v25.4s, b1a.4s, a2a.s[3]\n"
-                "ldr    b2q, [%[b_ptr], #32]\n"
-                "fmla    v26.4s, b2a.4s, a2a.s[3]\n"
-                "fmla    v27.4s, b3a.4s, a2a.s[3]\n"
-                "ldr    b3q, [%[b_ptr], #48]\n"
+                "fmla	v24.4s, b0a.4s, a2a.s[3]\n"
+                "fmla	v25.4s, b1a.4s, a2a.s[3]\n"
+                "ldr	b2q, [%[b_ptr], #32]\n"
+                "fmla	v26.4s, b2a.4s, a2a.s[3]\n"
+                "fmla	v27.4s, b3a.4s, a2a.s[3]\n"
+                "ldr	b3q, [%[b_ptr], #48]\n"
 
-                "fmla    v28.4s, b0a.4s, a3a.s[3]\n"
-                "ld1r    {a0.4s}, [%[a_ptr0]], #4\n"
-                "fmla    v29.4s, b1a.4s, a3a.s[3]\n"
-                "fmla    v30.4s, b2a.4s, a3a.s[3]\n"
-                "ld1r    {a1.4s}, [%[a_ptr1]], #4\n"
-                "fmla    v31.4s, b3a.4s, a3a.s[3]\n"
+                "fmla	v28.4s, b0a.4s, a3a.s[3]\n"
+                "ld1r	{a0.4s}, [%[a_ptr0]], #4\n"
+                "fmla	v29.4s, b1a.4s, a3a.s[3]\n"
+                "fmla	v30.4s, b2a.4s, a3a.s[3]\n"
+                "ld1r	{a1.4s}, [%[a_ptr1]], #4\n"
+                "fmla	v31.4s, b3a.4s, a3a.s[3]\n"
 
-                "fmla    v16.4s, bb0.4s, a0.4s\n"
-                "beq    9f\n"
-                "b    8f\n"
+                "fmla	v16.4s, bb0.4s, a0.4s\n"
+                "beq	9f\n"
+                "b	8f\n"
 
                 // "Odd ones" - lead in from odd
                 "7:\n"
-                "fmla    v20.4s, b0a.4s, a1.s[3]\n"
-                "subs    %w[odds], %w[odds], #1\n"
-                "fmla    v21.4s, b1a.4s, a1.s[3]\n"
-                "ldr    b0q, [%[b_ptr]]\n"
-                "fmla    v22.4s, b2a.4s, a1.s[3]\n"
-                "fmla    v23.4s, b3a.4s, a1.s[3]\n"
-                "ldr    b1q, [%[b_ptr], #16]\n"
+                "fmla	v20.4s, b0a.4s, a1.s[3]\n"
+                "subs	%w[odds], %w[odds], #1\n"
+                "fmla	v21.4s, b1a.4s, a1.s[3]\n"
+                "ldr	b0q, [%[b_ptr]]\n"
+                "fmla	v22.4s, b2a.4s, a1.s[3]\n"
+                "fmla	v23.4s, b3a.4s, a1.s[3]\n"
+                "ldr	b1q, [%[b_ptr], #16]\n"
 
-                "fmla    v24.4s, b0a.4s, a2.s[3]\n"
-                "fmla    v25.4s, b1a.4s, a2.s[3]\n"
-                "ldr    b2q, [%[b_ptr], #32]\n"
-                "fmla    v26.4s, b2a.4s, a2.s[3]\n"
-                "fmla    v27.4s, b3a.4s, a2.s[3]\n"
-                "ldr    b3q, [%[b_ptr], #48]\n"
+                "fmla	v24.4s, b0a.4s, a2.s[3]\n"
+                "fmla	v25.4s, b1a.4s, a2.s[3]\n"
+                "ldr	b2q, [%[b_ptr], #32]\n"
+                "fmla	v26.4s, b2a.4s, a2.s[3]\n"
+                "fmla	v27.4s, b3a.4s, a2.s[3]\n"
+                "ldr	b3q, [%[b_ptr], #48]\n"
 
-                "fmla    v28.4s, b0a.4s, a3.s[3]\n"
-                "ld1r    {a0.4s}, [%[a_ptr0]], #4\n"
-                "fmla    v29.4s, b1a.4s, a3.s[3]\n"
-                "fmla    v30.4s, b2a.4s, a3.s[3]\n"
-                "ld1r    {a1.4s}, [%[a_ptr1]], #4\n"
-                "fmla    v31.4s, b3a.4s, a3.s[3]\n"
+                "fmla	v28.4s, b0a.4s, a3.s[3]\n"
+                "ld1r	{a0.4s}, [%[a_ptr0]], #4\n"
+                "fmla	v29.4s, b1a.4s, a3.s[3]\n"
+                "fmla	v30.4s, b2a.4s, a3.s[3]\n"
+                "ld1r	{a1.4s}, [%[a_ptr1]], #4\n"
+                "fmla	v31.4s, b3a.4s, a3.s[3]\n"
 
-                "fmla    v16.4s, bb0.4s, a0.4s\n"
-                "beq    9f\n"
+                "fmla	v16.4s, bb0.4s, a0.4s\n"
+                "beq	9f\n"
 
                 // "Odd ones" - loop
                 "8:\n"
-                "fmla    v17.4s, bb1.4s, a0.4s\n"
-                "ld1r    {a2.4s}, [%[a_ptr2]], #4\n"
-                "fmla    v18.4s, bb2.4s, a0.4s\n"
-                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
-                "fmla    v19.4s, bb3.4s, a0.4s\n"
-                "ld1r    {a3.4s}, [%[a_ptr3]], #4\n"
+                "fmla	v17.4s, bb1.4s, a0.4s\n"
+                "ld1r	{a2.4s}, [%[a_ptr2]], #4\n"
+                "fmla	v18.4s, bb2.4s, a0.4s\n"
+                "add	%[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla	v19.4s, bb3.4s, a0.4s\n"
+                "ld1r	{a3.4s}, [%[a_ptr3]], #4\n"
 
-                "fmla    v20.4s, bb0.4s, a1.4s\n"
-                "subs    %w[odds], %w[odds], #1\n"
-                "fmla    v21.4s, bb1.4s, a1.4s\n"
-                "ld1r    {a0.4s}, [%[a_ptr0]], #4\n"
-                "fmla    v22.4s, bb2.4s, a1.4s\n"
-                "fmla    v23.4s, bb3.4s, a1.4s\n"
-                "ld1r    {a1.4s}, [%[a_ptr1]], #4\n"
+                "fmla	v20.4s, bb0.4s, a1.4s\n"
+                "subs	%w[odds], %w[odds], #1\n"
+                "fmla	v21.4s, bb1.4s, a1.4s\n"
+                "ld1r	{a0.4s}, [%[a_ptr0]], #4\n"
+                "fmla	v22.4s, bb2.4s, a1.4s\n"
+                "fmla	v23.4s, bb3.4s, a1.4s\n"
+                "ld1r	{a1.4s}, [%[a_ptr1]], #4\n"
 
-                "fmla    v24.4s, bb0.4s, a2.4s\n"
-                "fmla    v28.4s, bb0.4s, a3.4s\n"
-                "ldr    b0q, [%[b_ptr]]\n"
-                "fmla    v25.4s, bb1.4s, a2.4s\n"
-                "fmla    v29.4s, bb1.4s, a3.4s\n"
-                "ldr    b1q, [%[b_ptr], #16]\n"
+                "fmla	v24.4s, bb0.4s, a2.4s\n"
+                "fmla	v28.4s, bb0.4s, a3.4s\n"
+                "ldr	b0q, [%[b_ptr]]\n"
+                "fmla	v25.4s, bb1.4s, a2.4s\n"
+                "fmla	v29.4s, bb1.4s, a3.4s\n"
+                "ldr	b1q, [%[b_ptr], #16]\n"
 
-                "fmla    v26.4s, bb2.4s, a2.4s\n"
-                "fmla    v30.4s, bb2.4s, a3.4s\n"
-                "ldr    b2q, [%[b_ptr], #32]\n"
-                "fmla    v27.4s, bb3.4s, a2.4s\n"
-                "fmla    v31.4s, bb3.4s, a3.4s\n"
-                "ldr    b3q, [%[b_ptr], #48]\n"
-                "fmla    v16.4s, bb0.4s, a0.4s\n"
-                "bne    8b\n"
+                "fmla	v26.4s, bb2.4s, a2.4s\n"
+                "fmla	v30.4s, bb2.4s, a3.4s\n"
+                "ldr	b2q, [%[b_ptr], #32]\n"
+                "fmla	v27.4s, bb3.4s, a2.4s\n"
+                "fmla	v31.4s, bb3.4s, a3.4s\n"
+                "ldr	b3q, [%[b_ptr], #48]\n"
+                "fmla	v16.4s, bb0.4s, a0.4s\n"
+                "bne	8b\n"
 
                 // "Odd ones" - detached final iteration
                 "9:\n"
-                "fmla    v17.4s, bb1.4s, a0.4s\n"
-                "ld1r    {a2.4s}, [%[a_ptr2]], #4\n"
-                "fmla    v18.4s, bb2.4s, a0.4s\n"
-                "fmla    v19.4s, bb3.4s, a0.4s\n"
-                "ld1r    {a3.4s}, [%[a_ptr3]], #4\n"
+                "fmla	v17.4s, bb1.4s, a0.4s\n"
+                "ld1r	{a2.4s}, [%[a_ptr2]], #4\n"
+                "fmla	v18.4s, bb2.4s, a0.4s\n"
+                "fmla	v19.4s, bb3.4s, a0.4s\n"
+                "ld1r	{a3.4s}, [%[a_ptr3]], #4\n"
 
-                "fmla    v20.4s, bb0.4s, a1.4s\n"
-                "str    q16, [%[c_ptr0]]\n"
-                "fmla    v21.4s, bb1.4s, a1.4s\n"
-                "str    q17, [%[c_ptr0], #16]\n"
-                "fmla    v22.4s, bb2.4s, a1.4s\n"
-                "str    q18, [%[c_ptr0], #32]\n"
-                "fmla    v23.4s, bb3.4s, a1.4s\n"
-                "str    q19, [%[c_ptr0], #48]\n"
+                "fmla	v20.4s, bb0.4s, a1.4s\n"
+                "str	q16, [%[c_ptr0]]\n"
+                "fmla	v21.4s, bb1.4s, a1.4s\n"
+                "str	q17, [%[c_ptr0], #16]\n"
+                "fmla	v22.4s, bb2.4s, a1.4s\n"
+                "str	q18, [%[c_ptr0], #32]\n"
+                "fmla	v23.4s, bb3.4s, a1.4s\n"
+                "str	q19, [%[c_ptr0], #48]\n"
 
-                "fmla    v24.4s, bb0.4s, a2.4s\n"
-                "str    q20, [%[c_ptr1]]\n"
-                "fmla    v25.4s, bb1.4s, a2.4s\n"
-                "str    q21, [%[c_ptr1], #16]\n"
-                "fmla    v26.4s, bb2.4s, a2.4s\n"
-                "str    q22, [%[c_ptr1], #32]\n"
-                "fmla    v27.4s, bb3.4s, a2.4s\n"
-                "str    q23, [%[c_ptr1], #48]\n"
+                "fmla	v24.4s, bb0.4s, a2.4s\n"
+                "str	q20, [%[c_ptr1]]\n"
+                "fmla	v25.4s, bb1.4s, a2.4s\n"
+                "str	q21, [%[c_ptr1], #16]\n"
+                "fmla	v26.4s, bb2.4s, a2.4s\n"
+                "str	q22, [%[c_ptr1], #32]\n"
+                "fmla	v27.4s, bb3.4s, a2.4s\n"
+                "str	q23, [%[c_ptr1], #48]\n"
 
-                "fmla    v28.4s, bb0.4s, a3.4s\n"
-                "str    q24, [%[c_ptr2]]\n"
-                "fmla    v29.4s, bb1.4s, a3.4s\n"
-                "str    q25, [%[c_ptr2], #16]\n"
-                "fmla    v30.4s, bb2.4s, a3.4s\n"
-                "str    q26, [%[c_ptr2], #32]\n"
-                "fmla    v31.4s, bb3.4s, a3.4s\n"
-                "str    q27, [%[c_ptr2], #48]\n"
+                "fmla	v28.4s, bb0.4s, a3.4s\n"
+                "str	q24, [%[c_ptr2]]\n"
+                "fmla	v29.4s, bb1.4s, a3.4s\n"
+                "str	q25, [%[c_ptr2], #16]\n"
+                "fmla	v30.4s, bb2.4s, a3.4s\n"
+                "str	q26, [%[c_ptr2], #32]\n"
+                "fmla	v31.4s, bb3.4s, a3.4s\n"
+                "str	q27, [%[c_ptr2], #48]\n"
 
                 "3:\n"
                 "str	q28, [%[c_ptr3]]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp
index c89514f..a73bc76 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,14 +25,13 @@
 
 #ifdef __aarch64__
 
-namespace arm_gemm
-{
+namespace arm_gemm {
+
 // Actual kernel implementations
 void a64_sgemv_pretransposed(const float *, int, const float *, float *, float, int, int);
 
 // Pretransposed SGEMV strategy class.
-class sgemv_pretransposed
-{
+class sgemv_pretransposed {
 public:
     typedef float operand_type;
     typedef float result_type;
@@ -47,19 +46,17 @@
      * terms of this standard arrangement, so if the A matrix is in fact the
      * B matrix from a GEMM call, the sense of the transpose needs to be
      * reversed.  */
-    static const int  A_interleave = 32;
-    static const int  A_block      = 1;
-    static const bool A_transpose  = false;
+    static const int A_interleave = 32;
+    static const int A_block = 1;
+    static const bool A_transpose = false;
 
     /* Kernel blocking parameters */
     static const int out_width = 32;
-    static const int k_unroll  = 1;
+    static const int k_unroll = 1;
 
     kern_type kernel = a64_sgemv_pretransposed;
 
-    sgemv_pretransposed(const CPUInfo *ci)
-    {
-    }
+    sgemv_pretransposed(const CPUInfo *ci) { }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp
index 2907598..165e0a6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp
@@ -30,15 +30,13 @@
 #include "../../asmlib.hpp"
 #include "../../utils.hpp"
 
-namespace arm_gemm
-{
-void a64_sgemv_pretransposed(const float *A, int lda, const float *X, float *Y, float beta, int M, int N)
-{
-    const bool beta0 = (beta == 0.0f);
-    const bool beta1 = (beta == 1.0f);
+namespace arm_gemm {
 
-    for(int x = 0; x < N; x += 32)
-    {
+void a64_sgemv_pretransposed(const float *A, int lda, const float *X, float *Y, float beta, int M, int N) {
+    const bool beta0 = (beta==0.0f);
+    const bool beta1 = (beta==1.0f);
+
+    for (int x=0; x<N; x+=32) {
         float *y_ptr = Y + x;
 
         // How many elements are we processing in this loop?
@@ -53,20 +51,16 @@
         register float32x4_t r6 asm("v30");
         register float32x4_t r7 asm("v31");
 
-        register float32x4_t x0 asm("v0");
+        register float32x4_t x0  asm("v0");
         register float32x4_t x0a asm("v1");
 
         const float *x_ptr = X;
-        const float *a_ptr = A + ((x / 32) * lda);
+        const float *a_ptr = A + ((x/32) * lda);
 
-        if(beta0)
-        {
-            r0 = r1 = r2 = r3 = r4 = r5 = r6 = r7 = vdupq_n_f32(0.0f);
-        }
-        else
-        {
-            if(l == 32)
-            {
+        if (beta0) {
+            r0=r1=r2=r3=r4=r5=r6=r7=vdupq_n_f32(0.0f);
+        } else {
+            if (l==32) {
                 // Fastest path - load all 8 vectors
                 r0 = vld1q_f32(y_ptr);
                 r1 = vld1q_f32(y_ptr + 4);
@@ -76,29 +70,25 @@
                 r5 = vld1q_f32(y_ptr + 20);
                 r6 = vld1q_f32(y_ptr + 24);
                 r7 = vld1q_f32(y_ptr + 28);
-            }
-            else
-            {
+            } else {
                 // Slow case - leftovers.  Note that we don't care about
                 // out-of-range vectors and lanes as we will throw them away at
                 // the end.
-                int vecs    = l / 4; // How many leftover vectors?
-                int oddbits = l % 4; // And how many odd single values?
+                int vecs=l/4; // How many leftover vectors?
+                int oddbits=l%4; // And how many odd single values?
 
-                if(oddbits)
-                {
+                if (oddbits) {
                     // Load the outstanding odd values into a vector first
-                    float32x4_t oddvec  = vdupq_n_f32(0.0f); // This does not really need to be initialized, but the compiler has a hard time with that.
-                    float      *oddbase = y_ptr + l - oddbits;
+                    float32x4_t oddvec = vdupq_n_f32(0.0f); // This does not really need to be initialized, but the compiler has a hard time with that.
+                    float *oddbase = y_ptr + l - oddbits;
 
-                    switch(oddbits)
-                    {
+                    switch (oddbits) {
                         case 3:
                             oddvec = vld1q_lane_f32(oddbase + 2, oddvec, 2);
-                        // fall through
+                            // fall through
                         case 2:
                             oddvec = vld1q_lane_f32(oddbase + 1, oddvec, 1);
-                        // fall through
+                            // fall through
                         case 1:
                             oddvec = vld1q_lane_f32(oddbase, oddvec, 0);
                             break;
@@ -108,116 +98,60 @@
                     }
 
                     // Now load the whole vectors, putting the oddments in when we run out.
-                    do
-                    {
-                        if(vecs == 0)
-                        {
-                            r0 = oddvec;
-                            break;
-                        }
+                    do {
+                        if (vecs==0) { r0 = oddvec; break; }
 
                         r0 = vld1q_f32(y_ptr);
-                        if(--vecs == 0)
-                        {
-                            r1 = oddvec;
-                            break;
-                        }
+                        if (--vecs==0) { r1 = oddvec; break; }
 
                         r1 = vld1q_f32(y_ptr + 4);
-                        if(--vecs == 0)
-                        {
-                            r2 = oddvec;
-                            break;
-                        }
+                        if (--vecs==0) { r2 = oddvec; break; }
 
                         r2 = vld1q_f32(y_ptr + 8);
-                        if(--vecs == 0)
-                        {
-                            r3 = oddvec;
-                            break;
-                        }
+                        if (--vecs==0) { r3 = oddvec; break; }
 
                         r3 = vld1q_f32(y_ptr + 12);
-                        if(--vecs == 0)
-                        {
-                            r4 = oddvec;
-                            break;
-                        }
+                        if (--vecs==0) { r4 = oddvec; break; }
 
                         r4 = vld1q_f32(y_ptr + 16);
-                        if(--vecs == 0)
-                        {
-                            r5 = oddvec;
-                            break;
-                        }
+                        if (--vecs==0) { r5 = oddvec; break; }
 
                         r5 = vld1q_f32(y_ptr + 20);
-                        if(--vecs == 0)
-                        {
-                            r6 = oddvec;
-                            break;
-                        }
+                        if (--vecs==0) { r6 = oddvec; break; }
 
                         r6 = vld1q_f32(y_ptr + 24);
                         r7 = oddvec;
-                    }
-                    while(0);
-                }
-                else
-                {
+                    } while (0);
+                } else {
                     // Slightly less slow path - just load the whole vectors
-                    do
-                    {
+                    do {
                         // It can't be the case that oddbits==0 AND vecs==0 or we wouldn't be here.
-                        if(vecs == 0)
-                        {
-                            UNREACHABLE("Impossible lack of work to do");
-                        }
+                        if (vecs==0) { UNREACHABLE("Impossible lack of work to do"); }
 
                         r0 = vld1q_f32(y_ptr);
-                        if(--vecs == 0)
-                        {
-                            break;
-                        }
+                        if (--vecs==0) { break; }
 
                         r1 = vld1q_f32(y_ptr + 4);
-                        if(--vecs == 0)
-                        {
-                            break;
-                        }
+                        if (--vecs==0) { break; }
 
                         r2 = vld1q_f32(y_ptr + 8);
-                        if(--vecs == 0)
-                        {
-                            break;
-                        }
+                        if (--vecs==0) { break; }
 
                         r3 = vld1q_f32(y_ptr + 12);
-                        if(--vecs == 0)
-                        {
-                            break;
-                        }
+                        if (--vecs==0) { break; }
 
                         r4 = vld1q_f32(y_ptr + 16);
-                        if(--vecs == 0)
-                        {
-                            break;
-                        }
+                        if (--vecs==0) { break; }
 
                         r5 = vld1q_f32(y_ptr + 20);
-                        if(--vecs == 0)
-                        {
-                            break;
-                        }
+                        if (--vecs==0) { break; }
 
                         r6 = vld1q_f32(y_ptr + 24);
-                    }
-                    while(0);
+                    } while (0);
                 }
             }
 
-            if(!beta1)
-            {
+            if (!beta1) {
                 const float32x4_t vb = vdupq_n_f32(beta);
 
                 r0 = vmulq_f32(r0, vb);
@@ -231,34 +165,34 @@
             }
         }
 
-        if(M >= 8)
-        {
-            int k = (M / 8) - 1;
-            x0    = vld1q_f32(x_ptr);
+        if (M>=8) {
+            int k = (M/8)-1;
+            x0 = vld1q_f32(x_ptr);
 
-            __asm __volatile(
-                "ldr    q2, [%[a_ptr], #0]\n"
-                "ldr    q3, [%[a_ptr], #16]\n"
-                "ldr    q4, [%[a_ptr], #32]\n"
-                "ldr    q5, [%[a_ptr], #48]\n"
-                "ldr    q6, [%[a_ptr], #64]\n"
-                "ldr    q7, [%[a_ptr], #80]\n"
-                "ldr    q8, [%[a_ptr], #96]\n"
-                "ldr    q9, [%[a_ptr], #112]\n"
-                "ldr    q10, [%[a_ptr], #128]\n"
-                "ldr    q11, [%[a_ptr], #144]\n"
-                "ldr    q12, [%[a_ptr], #160]\n"
-                "ldr    q13, [%[a_ptr], #176]\n"
-                "ldr    q14, [%[a_ptr], #192]\n"
-                "ldr    q15, [%[a_ptr], #208]\n"
-                "ldr    q16, [%[a_ptr], #224]\n"
-                "ldr    q17, [%[a_ptr], #240]\n"
-                "ldr    q18, [%[a_ptr], #256]\n"
-                "ldr    q19, [%[a_ptr], #272]\n"
-                "ldr    q20, [%[a_ptr], #288]\n"
-                "ldr    q21, [%[a_ptr], #304]\n"
-                "ldr    q22, [%[a_ptr], #320]\n"
-                "ldr    q23, [%[a_ptr], #336]\n" ASM_PREFETCH("[%[a_ptr], #384]")
+            __asm __volatile (
+                "ldr	q2, [%[a_ptr], #0]\n"
+                "ldr	q3, [%[a_ptr], #16]\n"
+                "ldr	q4, [%[a_ptr], #32]\n"
+                "ldr	q5, [%[a_ptr], #48]\n"
+                "ldr	q6, [%[a_ptr], #64]\n"
+                "ldr	q7, [%[a_ptr], #80]\n"
+                "ldr	q8, [%[a_ptr], #96]\n"
+                "ldr	q9, [%[a_ptr], #112]\n"
+                "ldr	q10, [%[a_ptr], #128]\n"
+                "ldr	q11, [%[a_ptr], #144]\n"
+                "ldr	q12, [%[a_ptr], #160]\n"
+                "ldr	q13, [%[a_ptr], #176]\n"
+                "ldr	q14, [%[a_ptr], #192]\n"
+                "ldr	q15, [%[a_ptr], #208]\n"
+                "ldr	q16, [%[a_ptr], #224]\n"
+                "ldr	q17, [%[a_ptr], #240]\n"
+                "ldr	q18, [%[a_ptr], #256]\n"
+                "ldr	q19, [%[a_ptr], #272]\n"
+                "ldr	q20, [%[a_ptr], #288]\n"
+                "ldr	q21, [%[a_ptr], #304]\n"
+                "ldr	q22, [%[a_ptr], #320]\n"
+                "ldr	q23, [%[a_ptr], #336]\n"
+                ASM_PREFETCH("[%[a_ptr], #384]")
                 ASM_PREFETCH("[%[a_ptr], #448]")
                 ASM_PREFETCH("[%[a_ptr], #512]")
                 ASM_PREFETCH("[%[a_ptr], #576]")
@@ -284,363 +218,377 @@
                 ASM_PREFETCH("[%[a_ptr], #1856]")
                 ASM_PREFETCH("[%[a_ptr], #1920]")
                 ASM_PREFETCH("[%[a_ptr], #1984]")
-                "add    %[a_ptr], %[a_ptr], #352\n"
+                "add	%[a_ptr], %[a_ptr], #352\n"
 
-                "cbz    %w[k], 2f\n"
+                "cbz	%w[k], 2f\n"
 
                 "1:\n"
                 // Unroll 0
-                "fmla    %[r0].4s, v2.4s, %[x0].s[0]\n"
-                "ldr    %q[x0a], [%[x_ptr], #16]\n"
-                "fmla    %[r1].4s, v3.4s, %[x0].s[0]\n"
-                "ldr    q3, [%[a_ptr], #0]\n"
-                "subs    %w[k], %w[k], #1\n"
-                "fmla    %[r2].4s, v4.4s, %[x0].s[0]\n"
-                "ldr    q4, [%[a_ptr], #16]\n"
-                "fmla    %[r3].4s, v5.4s, %[x0].s[0]\n"
-                "ldr    q5, [%[a_ptr], #32]\n"
-                "add    %[x_ptr], %[x_ptr], #32\n" ASM_PREFETCH("[%[a_ptr], #1664]")
-                "fmla    %[r4].4s, v6.4s, %[x0].s[0]\n"
-                "ldr    q6, [%[a_ptr], #48]\n"
-                "fmla    %[r5].4s, v7.4s, %[x0].s[0]\n"
-                "ldr    q7, [%[a_ptr], #64]\n"
-                "fmla    %[r6].4s, v8.4s, %[x0].s[0]\n"
-                "ldr    q8, [%[a_ptr], #80]\n"
-                "fmla    %[r7].4s, v9.4s, %[x0].s[0]\n"
-                "ldr    q9, [%[a_ptr], #96]\n" ASM_PREFETCH("[%[a_ptr], #1728]")
+                "fmla	%[r0].4s, v2.4s, %[x0].s[0]\n"
+                "ldr	%q[x0a], [%[x_ptr], #16]\n"
+                "fmla	%[r1].4s, v3.4s, %[x0].s[0]\n"
+                "ldr	q3, [%[a_ptr], #0]\n"
+                "subs	%w[k], %w[k], #1\n"
+                "fmla	%[r2].4s, v4.4s, %[x0].s[0]\n"
+                "ldr	q4, [%[a_ptr], #16]\n"
+                "fmla	%[r3].4s, v5.4s, %[x0].s[0]\n"
+                "ldr	q5, [%[a_ptr], #32]\n"
+                "add	%[x_ptr], %[x_ptr], #32\n"
+                ASM_PREFETCH("[%[a_ptr], #1664]")
+                "fmla	%[r4].4s, v6.4s, %[x0].s[0]\n"
+                "ldr	q6, [%[a_ptr], #48]\n"
+                "fmla	%[r5].4s, v7.4s, %[x0].s[0]\n"
+                "ldr	q7, [%[a_ptr], #64]\n"
+                "fmla	%[r6].4s, v8.4s, %[x0].s[0]\n"
+                "ldr	q8, [%[a_ptr], #80]\n"
+                "fmla	%[r7].4s, v9.4s, %[x0].s[0]\n"
+                "ldr	q9, [%[a_ptr], #96]\n"
+                ASM_PREFETCH("[%[a_ptr], #1728]")
 
                 // Unroll 1
-                "fmla    %[r0].4s, v10.4s, %[x0].s[1]\n"
-                "ldr    q10, [%[a_ptr], #112]\n"
-                "fmla    %[r1].4s, v11.4s, %[x0].s[1]\n"
-                "ldr    q11, [%[a_ptr], #128]\n"
-                "fmla    %[r2].4s, v12.4s, %[x0].s[1]\n"
-                "ldr    q12, [%[a_ptr], #144]\n"
-                "fmla    %[r3].4s, v13.4s, %[x0].s[1]\n"
-                "ldr    q13, [%[a_ptr], #160]\n" ASM_PREFETCH("[%[a_ptr], #1792]")
-                "fmla    %[r4].4s, v14.4s, %[x0].s[1]\n"
-                "ldr    q14, [%[a_ptr], #176]\n"
-                "fmla    %[r5].4s, v15.4s, %[x0].s[1]\n"
-                "ldr    q15, [%[a_ptr], #192]\n"
-                "fmla    %[r6].4s, v16.4s, %[x0].s[1]\n"
-                "ldr    q16, [%[a_ptr], #208]\n"
-                "fmla    %[r7].4s, v17.4s, %[x0].s[1]\n"
-                "ldr    q17, [%[a_ptr], #224]\n" ASM_PREFETCH("[%[a_ptr], #1856]")
+                "fmla	%[r0].4s, v10.4s, %[x0].s[1]\n"
+                "ldr	q10, [%[a_ptr], #112]\n"
+                "fmla	%[r1].4s, v11.4s, %[x0].s[1]\n"
+                "ldr	q11, [%[a_ptr], #128]\n"
+                "fmla	%[r2].4s, v12.4s, %[x0].s[1]\n"
+                "ldr	q12, [%[a_ptr], #144]\n"
+                "fmla	%[r3].4s, v13.4s, %[x0].s[1]\n"
+                "ldr	q13, [%[a_ptr], #160]\n"
+                ASM_PREFETCH("[%[a_ptr], #1792]")
+                "fmla	%[r4].4s, v14.4s, %[x0].s[1]\n"
+                "ldr	q14, [%[a_ptr], #176]\n"
+                "fmla	%[r5].4s, v15.4s, %[x0].s[1]\n"
+                "ldr	q15, [%[a_ptr], #192]\n"
+                "fmla	%[r6].4s, v16.4s, %[x0].s[1]\n"
+                "ldr	q16, [%[a_ptr], #208]\n"
+                "fmla	%[r7].4s, v17.4s, %[x0].s[1]\n"
+                "ldr	q17, [%[a_ptr], #224]\n"
+                ASM_PREFETCH("[%[a_ptr], #1856]")
 
                 // Unroll 2
-                "fmla    %[r0].4s, v18.4s, %[x0].s[2]\n"
-                "ldr    q18, [%[a_ptr], #240]\n"
-                "fmla    %[r1].4s, v19.4s, %[x0].s[2]\n"
-                "ldr    q19, [%[a_ptr], #256]\n"
-                "fmla    %[r2].4s, v20.4s, %[x0].s[2]\n"
-                "ldr    q20, [%[a_ptr], #272]\n"
-                "fmla    %[r3].4s, v21.4s, %[x0].s[2]\n"
-                "ldr    q21, [%[a_ptr], #288]\n" ASM_PREFETCH("[%[a_ptr], #1920]")
-                "fmla    %[r4].4s, v22.4s, %[x0].s[2]\n"
-                "ldr    q22, [%[a_ptr], #304]\n"
-                "fmla    %[r5].4s, v23.4s, %[x0].s[2]\n"
-                "ldr    q23, [%[a_ptr], #320]\n"
-                "fmla    %[r6].4s, v3.4s, %[x0].s[2]\n"
-                "ldr    q2, [%[a_ptr], #336]\n"
-                "ldr    q3, [%[a_ptr], #352]\n"
-                "fmla    %[r7].4s, v4.4s, %[x0].s[2]\n"
-                "ldr    q4, [%[a_ptr], #368]\n" ASM_PREFETCH("[%[a_ptr], #1984]")
+                "fmla	%[r0].4s, v18.4s, %[x0].s[2]\n"
+                "ldr	q18, [%[a_ptr], #240]\n"
+                "fmla	%[r1].4s, v19.4s, %[x0].s[2]\n"
+                "ldr	q19, [%[a_ptr], #256]\n"
+                "fmla	%[r2].4s, v20.4s, %[x0].s[2]\n"
+                "ldr	q20, [%[a_ptr], #272]\n"
+                "fmla	%[r3].4s, v21.4s, %[x0].s[2]\n"
+                "ldr	q21, [%[a_ptr], #288]\n"
+                ASM_PREFETCH("[%[a_ptr], #1920]")
+                "fmla	%[r4].4s, v22.4s, %[x0].s[2]\n"
+                "ldr	q22, [%[a_ptr], #304]\n"
+                "fmla	%[r5].4s, v23.4s, %[x0].s[2]\n"
+                "ldr	q23, [%[a_ptr], #320]\n"
+                "fmla	%[r6].4s, v3.4s, %[x0].s[2]\n"
+                "ldr	q2, [%[a_ptr], #336]\n"
+                "ldr	q3, [%[a_ptr], #352]\n"
+                "fmla	%[r7].4s, v4.4s, %[x0].s[2]\n"
+                "ldr	q4, [%[a_ptr], #368]\n"
+                ASM_PREFETCH("[%[a_ptr], #1984]")
 
                 // Unroll 3
-                "fmla    %[r0].4s, v5.4s, %[x0].s[3]\n"
-                "ldr    q5, [%[a_ptr], #384]\n"
-                "fmla    %[r1].4s, v6.4s, %[x0].s[3]\n"
-                "ldr    q6, [%[a_ptr], #400]\n"
-                "fmla    %[r2].4s, v7.4s, %[x0].s[3]\n"
-                "ldr    q7, [%[a_ptr], #416]\n"
-                "fmla    %[r3].4s, v8.4s, %[x0].s[3]\n" ASM_PREFETCH("[%[a_ptr], #2048]")
-                "ldr    q8, [%[a_ptr], #432]\n"
-                "fmla    %[r4].4s, v9.4s, %[x0].s[3]\n"
-                "ldr    q9, [%[a_ptr], #448]\n"
-                "fmla    %[r5].4s, v10.4s, %[x0].s[3]\n"
-                "ldr    q10, [%[a_ptr], #464]\n"
-                "fmla    %[r6].4s, v11.4s, %[x0].s[3]\n"
-                "ldr    q11, [%[a_ptr], #480]\n"
-                "fmla    %[r7].4s, v12.4s, %[x0].s[3]\n"
-                "ldr    q12, [%[a_ptr], #496]\n" ASM_PREFETCH("[%[a_ptr], #2112]")
+                "fmla	%[r0].4s, v5.4s, %[x0].s[3]\n"
+                "ldr	q5, [%[a_ptr], #384]\n"
+                "fmla	%[r1].4s, v6.4s, %[x0].s[3]\n"
+                "ldr	q6, [%[a_ptr], #400]\n"
+                "fmla	%[r2].4s, v7.4s, %[x0].s[3]\n"
+                "ldr	q7, [%[a_ptr], #416]\n"
+                "fmla	%[r3].4s, v8.4s, %[x0].s[3]\n"
+                ASM_PREFETCH("[%[a_ptr], #2048]")
+                "ldr	q8, [%[a_ptr], #432]\n"
+                "fmla	%[r4].4s, v9.4s, %[x0].s[3]\n"
+                "ldr	q9, [%[a_ptr], #448]\n"
+                "fmla	%[r5].4s, v10.4s, %[x0].s[3]\n"
+                "ldr	q10, [%[a_ptr], #464]\n"
+                "fmla	%[r6].4s, v11.4s, %[x0].s[3]\n"
+                "ldr	q11, [%[a_ptr], #480]\n"
+                "fmla	%[r7].4s, v12.4s, %[x0].s[3]\n"
+                "ldr	q12, [%[a_ptr], #496]\n"
+                ASM_PREFETCH("[%[a_ptr], #2112]")
 
                 // Unroll 4
-                "fmla    %[r0].4s, v13.4s, %[x0a].s[0]\n"
-                "ldr    %q[x0], [%[x_ptr]]\n"
-                "fmla    %[r1].4s, v14.4s, %[x0a].s[0]\n"
-                "ldr    q14, [%[a_ptr], #512]\n"
-                "fmla    %[r2].4s, v15.4s, %[x0a].s[0]\n"
-                "ldr    q15, [%[a_ptr], #528]\n"
-                "fmla    %[r3].4s, v16.4s, %[x0a].s[0]\n" ASM_PREFETCH("[%[a_ptr], #2176]")
-                "ldr    q16, [%[a_ptr], #544]\n"
-                "fmla    %[r4].4s, v17.4s, %[x0a].s[0]\n"
-                "ldr    q17, [%[a_ptr], #560]\n"
-                "fmla    %[r5].4s, v18.4s, %[x0a].s[0]\n"
-                "ldr    q18, [%[a_ptr], #576]\n"
-                "fmla    %[r6].4s, v19.4s, %[x0a].s[0]\n"
-                "ldr    q19, [%[a_ptr], #592]\n"
-                "fmla    %[r7].4s, v20.4s, %[x0a].s[0]\n"
-                "ldr    q20, [%[a_ptr], #608]\n" ASM_PREFETCH("[%[a_ptr], #2240]")
+                "fmla	%[r0].4s, v13.4s, %[x0a].s[0]\n"
+                "ldr	%q[x0], [%[x_ptr]]\n"
+                "fmla	%[r1].4s, v14.4s, %[x0a].s[0]\n"
+                "ldr	q14, [%[a_ptr], #512]\n"
+                "fmla	%[r2].4s, v15.4s, %[x0a].s[0]\n"
+                "ldr	q15, [%[a_ptr], #528]\n"
+                "fmla	%[r3].4s, v16.4s, %[x0a].s[0]\n"
+                ASM_PREFETCH("[%[a_ptr], #2176]")
+                "ldr	q16, [%[a_ptr], #544]\n"
+                "fmla	%[r4].4s, v17.4s, %[x0a].s[0]\n"
+                "ldr	q17, [%[a_ptr], #560]\n"
+                "fmla	%[r5].4s, v18.4s, %[x0a].s[0]\n"
+                "ldr	q18, [%[a_ptr], #576]\n"
+                "fmla	%[r6].4s, v19.4s, %[x0a].s[0]\n"
+                "ldr	q19, [%[a_ptr], #592]\n"
+                "fmla	%[r7].4s, v20.4s, %[x0a].s[0]\n"
+                "ldr	q20, [%[a_ptr], #608]\n"
+                ASM_PREFETCH("[%[a_ptr], #2240]")
 
                 // Unroll 5
-                "fmla    %[r0].4s, v21.4s, %[x0a].s[1]\n"
-                "ldr    q21, [%[a_ptr], #624]\n"
-                "fmla    %[r1].4s, v22.4s, %[x0a].s[1]\n"
-                "ldr    q22, [%[a_ptr], #640]\n"
-                "fmla    %[r2].4s, v23.4s, %[x0a].s[1]\n"
-                "ldr    q23, [%[a_ptr], #656]\n"
-                "fmla    %[r3].4s, v2.4s, %[x0a].s[1]\n"
-                "ldr    q2, [%[a_ptr], #672]\n" ASM_PREFETCH("[%[a_ptr], #2304]")
-                "fmla    %[r4].4s, v3.4s, %[x0a].s[1]\n"
-                "ldr    q3, [%[a_ptr], #688]\n"
-                "fmla    %[r5].4s, v4.4s, %[x0a].s[1]\n"
-                "ldr    q4, [%[a_ptr], #704]\n"
-                "fmla    %[r6].4s, v5.4s, %[x0a].s[1]\n"
-                "ldr    q5, [%[a_ptr], #720]\n"
-                "fmla    %[r7].4s, v6.4s, %[x0a].s[1]\n"
-                "ldr    q6, [%[a_ptr], #736]\n" ASM_PREFETCH("[%[a_ptr], #2368]")
+                "fmla	%[r0].4s, v21.4s, %[x0a].s[1]\n"
+                "ldr	q21, [%[a_ptr], #624]\n"
+                "fmla	%[r1].4s, v22.4s, %[x0a].s[1]\n"
+                "ldr	q22, [%[a_ptr], #640]\n"
+                "fmla	%[r2].4s, v23.4s, %[x0a].s[1]\n"
+                "ldr	q23, [%[a_ptr], #656]\n"
+                "fmla	%[r3].4s, v2.4s, %[x0a].s[1]\n"
+                "ldr	q2, [%[a_ptr], #672]\n"
+                ASM_PREFETCH("[%[a_ptr], #2304]")
+                "fmla	%[r4].4s, v3.4s, %[x0a].s[1]\n"
+                "ldr	q3, [%[a_ptr], #688]\n"
+                "fmla	%[r5].4s, v4.4s, %[x0a].s[1]\n"
+                "ldr	q4, [%[a_ptr], #704]\n"
+                "fmla	%[r6].4s, v5.4s, %[x0a].s[1]\n"
+                "ldr	q5, [%[a_ptr], #720]\n"
+                "fmla	%[r7].4s, v6.4s, %[x0a].s[1]\n"
+                "ldr	q6, [%[a_ptr], #736]\n"
+                ASM_PREFETCH("[%[a_ptr], #2368]")
 
                 // Unroll 6
-                "fmla    %[r0].4s, v7.4s, %[x0a].s[2]\n"
-                "ldr    q7, [%[a_ptr], #752]\n"
-                "fmla    %[r1].4s, v8.4s, %[x0a].s[2]\n"
-                "ldr    q8, [%[a_ptr], #768]\n"
-                "fmla    %[r2].4s, v9.4s, %[x0a].s[2]\n"
-                "ldr    q9, [%[a_ptr], #784]\n"
-                "fmla    %[r3].4s, v10.4s, %[x0a].s[2]\n"
-                "ldr    q10, [%[a_ptr], #800]\n" ASM_PREFETCH("[%[a_ptr], #2432]")
-                "fmla    %[r4].4s, v11.4s, %[x0a].s[2]\n"
-                "ldr    q11, [%[a_ptr], #816]\n"
-                "fmla    %[r5].4s, v12.4s, %[x0a].s[2]\n"
-                "ldr    q12, [%[a_ptr], #832]\n"
-                "fmla    %[r6].4s, v14.4s, %[x0a].s[2]\n"
-                "ldr    q13, [%[a_ptr], #848]\n"
-                "ldr    q14, [%[a_ptr], #864]\n"
-                "fmla    %[r7].4s, v15.4s, %[x0a].s[2]\n"
-                "ldr    q15, [%[a_ptr], #880]\n" ASM_PREFETCH("[%[a_ptr], #2496]")
+                "fmla	%[r0].4s, v7.4s, %[x0a].s[2]\n"
+                "ldr	q7, [%[a_ptr], #752]\n"
+                "fmla	%[r1].4s, v8.4s, %[x0a].s[2]\n"
+                "ldr	q8, [%[a_ptr], #768]\n"
+                "fmla	%[r2].4s, v9.4s, %[x0a].s[2]\n"
+                "ldr	q9, [%[a_ptr], #784]\n"
+                "fmla	%[r3].4s, v10.4s, %[x0a].s[2]\n"
+                "ldr	q10, [%[a_ptr], #800]\n"
+                ASM_PREFETCH("[%[a_ptr], #2432]")
+                "fmla	%[r4].4s, v11.4s, %[x0a].s[2]\n"
+                "ldr	q11, [%[a_ptr], #816]\n"
+                "fmla	%[r5].4s, v12.4s, %[x0a].s[2]\n"
+                "ldr	q12, [%[a_ptr], #832]\n"
+                "fmla	%[r6].4s, v14.4s, %[x0a].s[2]\n"
+                "ldr	q13, [%[a_ptr], #848]\n"
+                "ldr	q14, [%[a_ptr], #864]\n"
+                "fmla	%[r7].4s, v15.4s, %[x0a].s[2]\n"
+                "ldr	q15, [%[a_ptr], #880]\n"
+                ASM_PREFETCH("[%[a_ptr], #2496]")
 
                 // Unroll 7
-                "fmla    %[r0].4s, v16.4s, %[x0a].s[3]\n"
-                "ldr    q16, [%[a_ptr], #896]\n"
-                "fmla    %[r1].4s, v17.4s, %[x0a].s[3]\n"
-                "ldr    q17, [%[a_ptr], #912]\n"
-                "fmla    %[r2].4s, v18.4s, %[x0a].s[3]\n"
-                "ldr    q18, [%[a_ptr], #928]\n"
-                "fmla    %[r3].4s, v19.4s, %[x0a].s[3]\n" ASM_PREFETCH("[%[a_ptr], #2560]")
-                "ldr    q19, [%[a_ptr], #944]\n"
-                "fmla    %[r4].4s, v20.4s, %[x0a].s[3]\n"
-                "ldr    q20, [%[a_ptr], #960]\n"
-                "fmla    %[r5].4s, v21.4s, %[x0a].s[3]\n"
-                "ldr    q21, [%[a_ptr], #976]\n"
-                "add    %[a_ptr], %[a_ptr], #1024\n"
-                "fmla    %[r6].4s, v22.4s, %[x0a].s[3]\n"
-                "ldr    q22, [%[a_ptr], #-32]\n"
-                "fmla    %[r7].4s, v23.4s, %[x0a].s[3]\n"
-                "ldr    q23, [%[a_ptr], #-16]\n" ASM_PREFETCH("[%[a_ptr], #1600]")
-                "bne    1b\n"
+                "fmla	%[r0].4s, v16.4s, %[x0a].s[3]\n"
+                "ldr	q16, [%[a_ptr], #896]\n"
+                "fmla	%[r1].4s, v17.4s, %[x0a].s[3]\n"
+                "ldr	q17, [%[a_ptr], #912]\n"
+                "fmla	%[r2].4s, v18.4s, %[x0a].s[3]\n"
+                "ldr	q18, [%[a_ptr], #928]\n"
+                "fmla	%[r3].4s, v19.4s, %[x0a].s[3]\n"
+                ASM_PREFETCH("[%[a_ptr], #2560]")
+                "ldr	q19, [%[a_ptr], #944]\n"
+                "fmla	%[r4].4s, v20.4s, %[x0a].s[3]\n"
+                "ldr	q20, [%[a_ptr], #960]\n"
+                "fmla	%[r5].4s, v21.4s, %[x0a].s[3]\n"
+                "ldr	q21, [%[a_ptr], #976]\n"
+                "add	%[a_ptr], %[a_ptr], #1024\n"
+                "fmla	%[r6].4s, v22.4s, %[x0a].s[3]\n"
+                "ldr	q22, [%[a_ptr], #-32]\n"
+                "fmla	%[r7].4s, v23.4s, %[x0a].s[3]\n"
+                "ldr	q23, [%[a_ptr], #-16]\n"
+                ASM_PREFETCH("[%[a_ptr], #1600]")
+                "bne	1b\n"
 
                 // Detached final iteration
                 "2:\n"
 
                 // Unroll 0
-                "fmla    %[r0].4s, v2.4s, %[x0].s[0]\n"
-                "ldr    %q[x0a], [%[x_ptr], #16]\n"
-                "fmla    %[r1].4s, v3.4s, %[x0].s[0]\n"
-                "ldr    q3, [%[a_ptr], #0]\n"
-                "subs    %w[k], %w[k], #1\n"
-                "fmla    %[r2].4s, v4.4s, %[x0].s[0]\n"
-                "ldr    q4, [%[a_ptr], #16]\n"
-                "fmla    %[r3].4s, v5.4s, %[x0].s[0]\n"
-                "ldr    q5, [%[a_ptr], #32]\n"
-                "add    %[x_ptr], %[x_ptr], #32\n"
-                "fmla    %[r4].4s, v6.4s, %[x0].s[0]\n"
-                "ldr    q6, [%[a_ptr], #48]\n"
-                "fmla    %[r5].4s, v7.4s, %[x0].s[0]\n"
-                "ldr    q7, [%[a_ptr], #64]\n"
-                "fmla    %[r6].4s, v8.4s, %[x0].s[0]\n"
-                "ldr    q8, [%[a_ptr], #80]\n"
-                "fmla    %[r7].4s, v9.4s, %[x0].s[0]\n"
-                "ldr    q9, [%[a_ptr], #96]\n"
+                "fmla	%[r0].4s, v2.4s, %[x0].s[0]\n"
+                "ldr	%q[x0a], [%[x_ptr], #16]\n"
+                "fmla	%[r1].4s, v3.4s, %[x0].s[0]\n"
+                "ldr	q3, [%[a_ptr], #0]\n"
+                "subs	%w[k], %w[k], #1\n"
+                "fmla	%[r2].4s, v4.4s, %[x0].s[0]\n"
+                "ldr	q4, [%[a_ptr], #16]\n"
+                "fmla	%[r3].4s, v5.4s, %[x0].s[0]\n"
+                "ldr	q5, [%[a_ptr], #32]\n"
+                "add	%[x_ptr], %[x_ptr], #32\n"
+                "fmla	%[r4].4s, v6.4s, %[x0].s[0]\n"
+                "ldr	q6, [%[a_ptr], #48]\n"
+                "fmla	%[r5].4s, v7.4s, %[x0].s[0]\n"
+                "ldr	q7, [%[a_ptr], #64]\n"
+                "fmla	%[r6].4s, v8.4s, %[x0].s[0]\n"
+                "ldr	q8, [%[a_ptr], #80]\n"
+                "fmla	%[r7].4s, v9.4s, %[x0].s[0]\n"
+                "ldr	q9, [%[a_ptr], #96]\n"
 
                 // Unroll 1
-                "fmla    %[r0].4s, v10.4s, %[x0].s[1]\n"
-                "ldr    q10, [%[a_ptr], #112]\n"
-                "fmla    %[r1].4s, v11.4s, %[x0].s[1]\n"
-                "ldr    q11, [%[a_ptr], #128]\n"
-                "fmla    %[r2].4s, v12.4s, %[x0].s[1]\n"
-                "ldr    q12, [%[a_ptr], #144]\n"
-                "fmla    %[r3].4s, v13.4s, %[x0].s[1]\n"
-                "ldr    q13, [%[a_ptr], #160]\n"
-                "fmla    %[r4].4s, v14.4s, %[x0].s[1]\n"
-                "ldr    q14, [%[a_ptr], #176]\n"
-                "fmla    %[r5].4s, v15.4s, %[x0].s[1]\n"
-                "ldr    q15, [%[a_ptr], #192]\n"
-                "fmla    %[r6].4s, v16.4s, %[x0].s[1]\n"
-                "ldr    q16, [%[a_ptr], #208]\n"
-                "fmla    %[r7].4s, v17.4s, %[x0].s[1]\n"
-                "ldr    q17, [%[a_ptr], #224]\n"
+                "fmla	%[r0].4s, v10.4s, %[x0].s[1]\n"
+                "ldr	q10, [%[a_ptr], #112]\n"
+                "fmla	%[r1].4s, v11.4s, %[x0].s[1]\n"
+                "ldr	q11, [%[a_ptr], #128]\n"
+                "fmla	%[r2].4s, v12.4s, %[x0].s[1]\n"
+                "ldr	q12, [%[a_ptr], #144]\n"
+                "fmla	%[r3].4s, v13.4s, %[x0].s[1]\n"
+                "ldr	q13, [%[a_ptr], #160]\n"
+                "fmla	%[r4].4s, v14.4s, %[x0].s[1]\n"
+                "ldr	q14, [%[a_ptr], #176]\n"
+                "fmla	%[r5].4s, v15.4s, %[x0].s[1]\n"
+                "ldr	q15, [%[a_ptr], #192]\n"
+                "fmla	%[r6].4s, v16.4s, %[x0].s[1]\n"
+                "ldr	q16, [%[a_ptr], #208]\n"
+                "fmla	%[r7].4s, v17.4s, %[x0].s[1]\n"
+                "ldr	q17, [%[a_ptr], #224]\n"
 
                 // Unroll 2
-                "fmla    %[r0].4s, v18.4s, %[x0].s[2]\n"
-                "ldr    q18, [%[a_ptr], #240]\n"
-                "fmla    %[r1].4s, v19.4s, %[x0].s[2]\n"
-                "ldr    q19, [%[a_ptr], #256]\n"
-                "fmla    %[r2].4s, v20.4s, %[x0].s[2]\n"
-                "ldr    q20, [%[a_ptr], #272]\n"
-                "fmla    %[r3].4s, v21.4s, %[x0].s[2]\n"
-                "ldr    q21, [%[a_ptr], #288]\n"
-                "fmla    %[r4].4s, v22.4s, %[x0].s[2]\n"
-                "ldr    q22, [%[a_ptr], #304]\n"
-                "fmla    %[r5].4s, v23.4s, %[x0].s[2]\n"
-                "ldr    q23, [%[a_ptr], #320]\n"
-                "fmla    %[r6].4s, v3.4s, %[x0].s[2]\n"
-                "ldr    q2, [%[a_ptr], #336]\n"
-                "ldr    q3, [%[a_ptr], #352]\n"
-                "fmla    %[r7].4s, v4.4s, %[x0].s[2]\n"
-                "ldr    q4, [%[a_ptr], #368]\n"
+                "fmla	%[r0].4s, v18.4s, %[x0].s[2]\n"
+                "ldr	q18, [%[a_ptr], #240]\n"
+                "fmla	%[r1].4s, v19.4s, %[x0].s[2]\n"
+                "ldr	q19, [%[a_ptr], #256]\n"
+                "fmla	%[r2].4s, v20.4s, %[x0].s[2]\n"
+                "ldr	q20, [%[a_ptr], #272]\n"
+                "fmla	%[r3].4s, v21.4s, %[x0].s[2]\n"
+                "ldr	q21, [%[a_ptr], #288]\n"
+                "fmla	%[r4].4s, v22.4s, %[x0].s[2]\n"
+                "ldr	q22, [%[a_ptr], #304]\n"
+                "fmla	%[r5].4s, v23.4s, %[x0].s[2]\n"
+                "ldr	q23, [%[a_ptr], #320]\n"
+                "fmla	%[r6].4s, v3.4s, %[x0].s[2]\n"
+                "ldr	q2, [%[a_ptr], #336]\n"
+                "ldr	q3, [%[a_ptr], #352]\n"
+                "fmla	%[r7].4s, v4.4s, %[x0].s[2]\n"
+                "ldr	q4, [%[a_ptr], #368]\n"
 
                 // Unroll 3
-                "fmla    %[r0].4s, v5.4s, %[x0].s[3]\n"
-                "ldr    q5, [%[a_ptr], #384]\n"
-                "fmla    %[r1].4s, v6.4s, %[x0].s[3]\n"
-                "ldr    q6, [%[a_ptr], #400]\n"
-                "fmla    %[r2].4s, v7.4s, %[x0].s[3]\n"
-                "ldr    q7, [%[a_ptr], #416]\n"
-                "fmla    %[r3].4s, v8.4s, %[x0].s[3]\n"
-                "ldr    q8, [%[a_ptr], #432]\n"
-                "fmla    %[r4].4s, v9.4s, %[x0].s[3]\n"
-                "ldr    q9, [%[a_ptr], #448]\n"
-                "fmla    %[r5].4s, v10.4s, %[x0].s[3]\n"
-                "ldr    q10, [%[a_ptr], #464]\n"
-                "fmla    %[r6].4s, v11.4s, %[x0].s[3]\n"
-                "ldr    q11, [%[a_ptr], #480]\n"
-                "fmla    %[r7].4s, v12.4s, %[x0].s[3]\n"
-                "ldr    q12, [%[a_ptr], #496]\n"
+                "fmla	%[r0].4s, v5.4s, %[x0].s[3]\n"
+                "ldr	q5, [%[a_ptr], #384]\n"
+                "fmla	%[r1].4s, v6.4s, %[x0].s[3]\n"
+                "ldr	q6, [%[a_ptr], #400]\n"
+                "fmla	%[r2].4s, v7.4s, %[x0].s[3]\n"
+                "ldr	q7, [%[a_ptr], #416]\n"
+                "fmla	%[r3].4s, v8.4s, %[x0].s[3]\n"
+                "ldr	q8, [%[a_ptr], #432]\n"
+                "fmla	%[r4].4s, v9.4s, %[x0].s[3]\n"
+                "ldr	q9, [%[a_ptr], #448]\n"
+                "fmla	%[r5].4s, v10.4s, %[x0].s[3]\n"
+                "ldr	q10, [%[a_ptr], #464]\n"
+                "fmla	%[r6].4s, v11.4s, %[x0].s[3]\n"
+                "ldr	q11, [%[a_ptr], #480]\n"
+                "fmla	%[r7].4s, v12.4s, %[x0].s[3]\n"
+                "ldr	q12, [%[a_ptr], #496]\n"
 
                 // Unroll 4
-                "fmla    %[r0].4s, v13.4s, %[x0a].s[0]\n"
-                "fmla    %[r1].4s, v14.4s, %[x0a].s[0]\n"
-                "ldr    q14, [%[a_ptr], #512]\n"
-                "fmla    %[r2].4s, v15.4s, %[x0a].s[0]\n"
-                "ldr    q15, [%[a_ptr], #528]\n"
-                "fmla    %[r3].4s, v16.4s, %[x0a].s[0]\n"
-                "ldr    q16, [%[a_ptr], #544]\n"
-                "fmla    %[r4].4s, v17.4s, %[x0a].s[0]\n"
-                "ldr    q17, [%[a_ptr], #560]\n"
-                "fmla    %[r5].4s, v18.4s, %[x0a].s[0]\n"
-                "ldr    q18, [%[a_ptr], #576]\n"
-                "fmla    %[r6].4s, v19.4s, %[x0a].s[0]\n"
-                "ldr    q19, [%[a_ptr], #592]\n"
-                "fmla    %[r7].4s, v20.4s, %[x0a].s[0]\n"
-                "ldr    q20, [%[a_ptr], #608]\n"
+                "fmla	%[r0].4s, v13.4s, %[x0a].s[0]\n"
+                "fmla	%[r1].4s, v14.4s, %[x0a].s[0]\n"
+                "ldr	q14, [%[a_ptr], #512]\n"
+                "fmla	%[r2].4s, v15.4s, %[x0a].s[0]\n"
+                "ldr	q15, [%[a_ptr], #528]\n"
+                "fmla	%[r3].4s, v16.4s, %[x0a].s[0]\n"
+                "ldr	q16, [%[a_ptr], #544]\n"
+                "fmla	%[r4].4s, v17.4s, %[x0a].s[0]\n"
+                "ldr	q17, [%[a_ptr], #560]\n"
+                "fmla	%[r5].4s, v18.4s, %[x0a].s[0]\n"
+                "ldr	q18, [%[a_ptr], #576]\n"
+                "fmla	%[r6].4s, v19.4s, %[x0a].s[0]\n"
+                "ldr	q19, [%[a_ptr], #592]\n"
+                "fmla	%[r7].4s, v20.4s, %[x0a].s[0]\n"
+                "ldr	q20, [%[a_ptr], #608]\n"
 
                 // Unroll 5
-                "fmla    %[r0].4s, v21.4s, %[x0a].s[1]\n"
-                "ldr    q21, [%[a_ptr], #624]\n"
-                "fmla    %[r1].4s, v22.4s, %[x0a].s[1]\n"
-                "ldr    q22, [%[a_ptr], #640]\n"
-                "fmla    %[r2].4s, v23.4s, %[x0a].s[1]\n"
-                "ldr    q23, [%[a_ptr], #656]\n"
-                "fmla    %[r3].4s, v2.4s, %[x0a].s[1]\n"
-                "add    %[a_ptr], %[a_ptr], #672\n"
-                "fmla    %[r4].4s, v3.4s, %[x0a].s[1]\n"
-                "fmla    %[r5].4s, v4.4s, %[x0a].s[1]\n"
-                "fmla    %[r6].4s, v5.4s, %[x0a].s[1]\n"
-                "fmla    %[r7].4s, v6.4s, %[x0a].s[1]\n"
+                "fmla	%[r0].4s, v21.4s, %[x0a].s[1]\n"
+                "ldr	q21, [%[a_ptr], #624]\n"
+                "fmla	%[r1].4s, v22.4s, %[x0a].s[1]\n"
+                "ldr	q22, [%[a_ptr], #640]\n"
+                "fmla	%[r2].4s, v23.4s, %[x0a].s[1]\n"
+                "ldr	q23, [%[a_ptr], #656]\n"
+                "fmla	%[r3].4s, v2.4s, %[x0a].s[1]\n"
+                "add	%[a_ptr], %[a_ptr], #672\n"
+                "fmla	%[r4].4s, v3.4s, %[x0a].s[1]\n"
+                "fmla	%[r5].4s, v4.4s, %[x0a].s[1]\n"
+                "fmla	%[r6].4s, v5.4s, %[x0a].s[1]\n"
+                "fmla	%[r7].4s, v6.4s, %[x0a].s[1]\n"
 
                 // Unroll 6
-                "fmla    %[r0].4s, v7.4s, %[x0a].s[2]\n"
-                "fmla    %[r1].4s, v8.4s, %[x0a].s[2]\n"
-                "fmla    %[r2].4s, v9.4s, %[x0a].s[2]\n"
-                "fmla    %[r3].4s, v10.4s, %[x0a].s[2]\n"
-                "fmla    %[r4].4s, v11.4s, %[x0a].s[2]\n"
-                "fmla    %[r5].4s, v12.4s, %[x0a].s[2]\n"
-                "fmla    %[r6].4s, v14.4s, %[x0a].s[2]\n"
-                "fmla    %[r7].4s, v15.4s, %[x0a].s[2]\n"
+                "fmla	%[r0].4s, v7.4s, %[x0a].s[2]\n"
+                "fmla	%[r1].4s, v8.4s, %[x0a].s[2]\n"
+                "fmla	%[r2].4s, v9.4s, %[x0a].s[2]\n"
+                "fmla	%[r3].4s, v10.4s, %[x0a].s[2]\n"
+                "fmla	%[r4].4s, v11.4s, %[x0a].s[2]\n"
+                "fmla	%[r5].4s, v12.4s, %[x0a].s[2]\n"
+                "fmla	%[r6].4s, v14.4s, %[x0a].s[2]\n"
+                "fmla	%[r7].4s, v15.4s, %[x0a].s[2]\n"
 
                 // Unroll 7
-                "fmla    %[r0].4s, v16.4s, %[x0a].s[3]\n"
-                "fmla    %[r1].4s, v17.4s, %[x0a].s[3]\n"
-                "fmla    %[r2].4s, v18.4s, %[x0a].s[3]\n"
-                "fmla    %[r3].4s, v19.4s, %[x0a].s[3]\n"
-                "fmla    %[r4].4s, v20.4s, %[x0a].s[3]\n"
-                "fmla    %[r5].4s, v21.4s, %[x0a].s[3]\n"
-                "fmla    %[r6].4s, v22.4s, %[x0a].s[3]\n"
-                "fmla    %[r7].4s, v23.4s, %[x0a].s[3]\n"
-                :
-                [a_ptr] "+r"(a_ptr), [x_ptr] "+r"(x_ptr),
-                [x0] "+w"(x0), [x0a] "+w"(x0a), [k] "+r"(k),
-                [r0] "+w"(r0), [r1] "+w"(r1), [r2] "+w"(r2), [r3] "+w"(r3),
-                [r4] "+w"(r4), [r5] "+w"(r5), [r6] "+w"(r6), [r7] "+w"(r7)
-                :
-                : "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
-                "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "x20", "x21", "cc", "memory");
+                "fmla	%[r0].4s, v16.4s, %[x0a].s[3]\n"
+                "fmla	%[r1].4s, v17.4s, %[x0a].s[3]\n"
+                "fmla	%[r2].4s, v18.4s, %[x0a].s[3]\n"
+                "fmla	%[r3].4s, v19.4s, %[x0a].s[3]\n"
+                "fmla	%[r4].4s, v20.4s, %[x0a].s[3]\n"
+                "fmla	%[r5].4s, v21.4s, %[x0a].s[3]\n"
+                "fmla	%[r6].4s, v22.4s, %[x0a].s[3]\n"
+                "fmla	%[r7].4s, v23.4s, %[x0a].s[3]\n"
+            :
+              [a_ptr] "+r" (a_ptr), [x_ptr] "+r" (x_ptr),
+              [x0] "+w" (x0), [x0a] "+w" (x0a), [k] "+r" (k),
+              [r0] "+w" (r0), [r1] "+w" (r1), [r2] "+w" (r2), [r3] "+w" (r3),
+              [r4] "+w" (r4), [r5] "+w" (r5), [r6] "+w" (r6), [r7] "+w" (r7)
+            :
+            : "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
+              "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "x20", "x21", "cc", "memory");
         }
 
         // Deal with ragged M
-        if(M % 8)
-        {
-            int l = (M % 8) - 1;
+        if (M % 8) {
+            int l=(M%8)-1;
 
-            __asm __volatile(
-                "ldr    q2, [%[a_ptr], #0]\n"
-                "ldr    q3, [%[a_ptr], #16]\n"
-                "ldr    q4, [%[a_ptr], #32]\n"
-                "ldr    q5, [%[a_ptr], #48]\n"
-                "ldr    q6, [%[a_ptr], #64]\n"
-                "ldr    q7, [%[a_ptr], #80]\n"
-                "ldr    q8, [%[a_ptr], #96]\n"
-                "ldr    q9, [%[a_ptr], #112]\n"
-                "ldr    %s[x0], [%[x_ptr]]\n"
-                "add    %[a_ptr], %[a_ptr], #128\n"
-                "add    %[x_ptr], %[x_ptr], #4\n"
+            __asm __volatile (
+                "ldr	q2, [%[a_ptr], #0]\n"
+                "ldr	q3, [%[a_ptr], #16]\n"
+                "ldr	q4, [%[a_ptr], #32]\n"
+                "ldr	q5, [%[a_ptr], #48]\n"
+                "ldr	q6, [%[a_ptr], #64]\n"
+                "ldr	q7, [%[a_ptr], #80]\n"
+                "ldr	q8, [%[a_ptr], #96]\n"
+                "ldr	q9, [%[a_ptr], #112]\n"
+                "ldr	%s[x0], [%[x_ptr]]\n"
+                "add	%[a_ptr], %[a_ptr], #128\n"
+                "add	%[x_ptr], %[x_ptr], #4\n"
 
-                "cbz    %w[l], 2f\n"
+                "cbz	%w[l], 2f\n"
 
                 "1:\n"
-                "fmla    %[r0].4s, v2.4s, %[x0].s[0]\n"
-                "ldr    q2, [%[a_ptr], #0]\n"
-                "subs    %w[l], %w[l], #1\n"
-                "fmla    %[r1].4s, v3.4s, %[x0].s[0]\n"
-                "ldr    q3, [%[a_ptr], #16]\n"
-                "fmla    %[r2].4s, v4.4s, %[x0].s[0]\n"
-                "ldr    q4, [%[a_ptr], #32]\n"
-                "fmla    %[r3].4s, v5.4s, %[x0].s[0]\n"
-                "ldr    q5, [%[a_ptr], #48]\n"
-                "fmla    %[r4].4s, v6.4s, %[x0].s[0]\n"
-                "ldr    q6, [%[a_ptr], #64]\n"
-                "fmla    %[r5].4s, v7.4s, %[x0].s[0]\n"
-                "ldr    q7, [%[a_ptr], #80]\n"
-                "fmla    %[r6].4s, v8.4s, %[x0].s[0]\n"
-                "ldr    q8, [%[a_ptr], #96]\n"
-                "fmla    %[r7].4s, v9.4s, %[x0].s[0]\n"
-                "ldr    q9, [%[a_ptr], #112]\n"
-                "ldr    %s[x0], [%[x_ptr]]\n"
-                "add    %[a_ptr], %[a_ptr], #128\n"
-                "add    %[x_ptr], %[x_ptr], #4\n"
-                "bne    1b\n"
+                "fmla	%[r0].4s, v2.4s, %[x0].s[0]\n"
+                "ldr	q2, [%[a_ptr], #0]\n"
+                "subs	%w[l], %w[l], #1\n"
+                "fmla	%[r1].4s, v3.4s, %[x0].s[0]\n"
+                "ldr	q3, [%[a_ptr], #16]\n"
+                "fmla	%[r2].4s, v4.4s, %[x0].s[0]\n"
+                "ldr	q4, [%[a_ptr], #32]\n"
+                "fmla	%[r3].4s, v5.4s, %[x0].s[0]\n"
+                "ldr	q5, [%[a_ptr], #48]\n"
+                "fmla	%[r4].4s, v6.4s, %[x0].s[0]\n"
+                "ldr	q6, [%[a_ptr], #64]\n"
+                "fmla	%[r5].4s, v7.4s, %[x0].s[0]\n"
+                "ldr	q7, [%[a_ptr], #80]\n"
+                "fmla	%[r6].4s, v8.4s, %[x0].s[0]\n"
+                "ldr	q8, [%[a_ptr], #96]\n"
+                "fmla	%[r7].4s, v9.4s, %[x0].s[0]\n"
+                "ldr	q9, [%[a_ptr], #112]\n"
+                "ldr	%s[x0], [%[x_ptr]]\n"
+                "add	%[a_ptr], %[a_ptr], #128\n"
+                "add	%[x_ptr], %[x_ptr], #4\n"
+                "bne	1b\n"
 
                 "2:\n"
 
-                "fmla    %[r0].4s, v2.4s, %[x0].s[0]\n"
-                "fmla    %[r1].4s, v3.4s, %[x0].s[0]\n"
-                "fmla    %[r2].4s, v4.4s, %[x0].s[0]\n"
-                "fmla    %[r3].4s, v5.4s, %[x0].s[0]\n"
-                "fmla    %[r4].4s, v6.4s, %[x0].s[0]\n"
-                "fmla    %[r5].4s, v7.4s, %[x0].s[0]\n"
-                "fmla    %[r6].4s, v8.4s, %[x0].s[0]\n"
-                "fmla    %[r7].4s, v9.4s, %[x0].s[0]\n"
-                :
-                [a_ptr] "+r"(a_ptr), [x_ptr] "+r"(x_ptr),
-                [x0] "+w"(x0), [l] "+r"(l),
-                [r0] "+w"(r0), [r1] "+w"(r1), [r2] "+w"(r2), [r3] "+w"(r3),
-                [r4] "+w"(r4), [r5] "+w"(r5), [r6] "+w"(r6), [r7] "+w"(r7)
-                :
-                : "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "cc", "memory");
+                "fmla	%[r0].4s, v2.4s, %[x0].s[0]\n"
+                "fmla	%[r1].4s, v3.4s, %[x0].s[0]\n"
+                "fmla	%[r2].4s, v4.4s, %[x0].s[0]\n"
+                "fmla	%[r3].4s, v5.4s, %[x0].s[0]\n"
+                "fmla	%[r4].4s, v6.4s, %[x0].s[0]\n"
+                "fmla	%[r5].4s, v7.4s, %[x0].s[0]\n"
+                "fmla	%[r6].4s, v8.4s, %[x0].s[0]\n"
+                "fmla	%[r7].4s, v9.4s, %[x0].s[0]\n"
+            :
+              [a_ptr] "+r" (a_ptr), [x_ptr] "+r" (x_ptr),
+              [x0] "+w" (x0), [l] "+r" (l),
+              [r0] "+w" (r0), [r1] "+w" (r1), [r2] "+w" (r2), [r3] "+w" (r3),
+              [r4] "+w" (r4), [r5] "+w" (r5), [r6] "+w" (r6), [r7] "+w" (r7)
+            :
+            : "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "cc", "memory");
         }
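In rough terms, the ragged-M path above folds in the last M % 8 entries of x one at a time: each pass multiplies one packed block of 32 A values by a single x scalar into the 32 accumulator lanes held in r0..r7. A hedged scalar sketch of that bookkeeping, with illustrative names rather than the kernel's registers:

// Scalar sketch (assumed equivalent, not the kernel itself): fold the
// remaining m_left = M % 8 entries of x into 32 accumulators.
static void ragged_m_reference(const float *a_ptr, const float *x_ptr,
                               float *acc /* 32 lanes, as in r0..r7 */, int m_left) {
    for (int i = 0; i < m_left; i++) {
        float xv = x_ptr[i];                  // one x scalar per pass
        for (int j = 0; j < 32; j++) {
            acc[j] += a_ptr[j] * xv;          // one packed 32-float block of A
        }
        a_ptr += 32;                          // mirrors the #128-byte advances above
    }
}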
 
-        if(l == 32)
-        {
+        if (l==32) {
             // Fast path
             vst1q_f32(y_ptr, r0);
             vst1q_f32(y_ptr + 4, r1);
@@ -650,82 +598,48 @@
             vst1q_f32(y_ptr + 20, r5);
             vst1q_f32(y_ptr + 24, r6);
             vst1q_f32(y_ptr + 28, r7);
-        }
-        else
-        {
-            int vecs    = l / 4;
-            int oddbits = l % 4;
+        } else {
+            int vecs=l/4;
+            int oddbits=l%4;
 
-            if(oddbits)
-            {
+            if (oddbits) {
                 // As above - slowest path deals with vectors plus odd bits
                 float32x4_t oddvec;
 
-                do
-                {
-                    if(vecs == 0)
-                    {
-                        oddvec = r0;
-                        break;
-                    }
+                do {
+                    if (vecs==0) { oddvec=r0; break; }
 
                     vst1q_f32(y_ptr, r0);
-                    if(--vecs == 0)
-                    {
-                        oddvec = r1;
-                        break;
-                    }
+                    if (--vecs==0) { oddvec=r1; break; }
 
                     vst1q_f32(y_ptr + 4, r1);
-                    if(--vecs == 0)
-                    {
-                        oddvec = r2;
-                        break;
-                    }
+                    if (--vecs==0) { oddvec=r2; break; }
 
                     vst1q_f32(y_ptr + 8, r2);
-                    if(--vecs == 0)
-                    {
-                        oddvec = r3;
-                        break;
-                    }
+                    if (--vecs==0) { oddvec=r3; break; }
 
                     vst1q_f32(y_ptr + 12, r3);
-                    if(--vecs == 0)
-                    {
-                        oddvec = r4;
-                        break;
-                    }
+                    if (--vecs==0) { oddvec=r4; break; }
 
                     vst1q_f32(y_ptr + 16, r4);
-                    if(--vecs == 0)
-                    {
-                        oddvec = r5;
-                        break;
-                    }
+                    if (--vecs==0) { oddvec=r5; break; }
 
                     vst1q_f32(y_ptr + 20, r5);
-                    if(--vecs == 0)
-                    {
-                        oddvec = r6;
-                        break;
-                    }
+                    if (--vecs==0) { oddvec=r6; break; }
 
                     vst1q_f32(y_ptr + 24, r6);
-                    oddvec = r7;
-                }
-                while(0);
+                    oddvec=r7;
+                } while (0);
 
                 float *oddbase = y_ptr + l - oddbits;
 
-                switch(oddbits)
-                {
+                switch(oddbits) {
                     case 3:
                         vst1q_lane_f32(oddbase + 2, oddvec, 2);
-                    // fall through
+                        // fall through
                     case 2:
                         vst1q_lane_f32(oddbase + 1, oddvec, 1);
-                    // fall through
+                        // fall through
                     case 1:
                         vst1q_lane_f32(oddbase, oddvec, 0);
                         break;
@@ -734,56 +648,31 @@
                         // oddbits must be 1, 2 or 3.
                         UNREACHABLE("Impossible case in switch.");
                 }
-            }
-            else
-            {
+            } else {
                 // As above - medium path deals with vectors only
-                do
-                {
-                    if(vecs == 0)
-                    {
-                        UNREACHABLE("vecs and oddbits can't both be 0");
-                    }
+                do {
+                    if (vecs==0) { UNREACHABLE("vecs and oddbits can't both be 0"); }
 
                     vst1q_f32(y_ptr, r0);
-                    if(--vecs == 0)
-                    {
-                        break;
-                    }
+                    if (--vecs==0) { break; }
 
                     vst1q_f32(y_ptr + 4, r1);
-                    if(--vecs == 0)
-                    {
-                        break;
-                    }
+                    if (--vecs==0) { break; }
 
                     vst1q_f32(y_ptr + 8, r2);
-                    if(--vecs == 0)
-                    {
-                        break;
-                    }
+                    if (--vecs==0) { break; }
 
                     vst1q_f32(y_ptr + 12, r3);
-                    if(--vecs == 0)
-                    {
-                        break;
-                    }
+                    if (--vecs==0) { break; }
 
                     vst1q_f32(y_ptr + 16, r4);
-                    if(--vecs == 0)
-                    {
-                        break;
-                    }
+                    if (--vecs==0) { break; }
 
                     vst1q_f32(y_ptr + 20, r5);
-                    if(--vecs == 0)
-                    {
-                        break;
-                    }
+                    if (--vecs==0) { break; }
 
                     vst1q_f32(y_ptr + 24, r6);
-                }
-                while(0);
+                } while (0);
             }
         }
     }
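The store code above splits into a fast path (all 32 results written with full vector stores) and slower paths that write l (fewer than 32) results as whole 4-float vectors followed by one to three lane stores. A minimal scalar sketch of the same split, assuming a hypothetical results[] holding the 32 accumulated values:

// Hedged illustration only: plain loops in place of the unrolled
// vst1q_f32 / vst1q_lane_f32 sequence above.
static void store_tail(float *y_ptr, const float *results, int l) {
    int vecs    = l / 4;              // complete 4-float groups
    int oddbits = l % 4;              // 0-3 leftover scalars

    for (int v = 0; v < vecs * 4; v++) {
        y_ptr[v] = results[v];        // the "vectors only" part
    }

    float *oddbase = y_ptr + l - oddbits;
    const float *oddsrc = results + vecs * 4;
    for (int i = 0; i < oddbits; i++) {
        oddbase[i] = oddsrc[i];       // the odd 1-3 lanes
    }
}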
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
index 5b9bd72..18c5c3a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,14 +25,13 @@
 
 #ifdef __aarch64__
 
-namespace arm_gemm
-{
+namespace arm_gemm {
+
 // Actual kernel implementations
 void a64_sgemv_trans(const float *, const float *, float *, float, int, int, int);
 
 // Transposed SGEMV strategy class.
-class sgemv_trans
-{
+class sgemv_trans {
 public:
     typedef float operand_type;
     typedef float result_type;
@@ -41,13 +40,11 @@
 
     /* Kernel blocking parameters */
     static const int out_width = 96;
-    static const int k_unroll  = 1;
+    static const int k_unroll = 1;
 
-    kern_type kernel = a64_sgemv_trans;
+    kern_type kernel=a64_sgemv_trans;
 
-    sgemv_trans(const CPUInfo *ci)
-    {
-    }
+    sgemv_trans(const CPUInfo *ci) { }
 };
 
 } // namespace arm_gemm
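The out_width of 96 above matches the N >= 96 blocking in the kernel that follows. Going by that kernel's loads and stores, the operation appears to be y[j] = beta * y[j] + the sum over i of A[i * lda + j] * x[i]; a plain reference sketch under that reading (an illustration, not the library implementation):

// Hedged reference: transposed SGEMV as the kernel below appears to compute it.
// Argument order mirrors a64_sgemv_trans(A, X, Y, beta, lda, M, N).
static void sgemv_trans_reference(const float *A, const float *X, float *Y,
                                  float beta, int lda, int M, int N) {
    for (int j = 0; j < N; j++) {
        float acc = 0.0f;
        for (int i = 0; i < M; i++) {
            acc += A[i * lda + j] * X[i];   // column j of A, walked down the rows
        }
        Y[j] = beta * Y[j] + acc;
    }
}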
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp
index 8fa403b..64ef9d8 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp
@@ -42,464 +42,472 @@
 // higher performance, but that's left to the outer loop.  In this kernel we
 // process all of M at the same time.
 
+
 // How far ahead to prefetch for the first and subsequent prefetches.
 // These values work for A72 on JunoR2...
 
 #define FIRST_PFD 9
 #define PFD 6
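Since the prefetch pointers below advance by lda floats per step, FIRST_PFD and PFD are effectively measured in matrix rows, and both are capped further down when M is small. A hedged helper showing the resulting byte distance for an illustrative leading dimension:

#include <cstddef>

// Illustration only: byte distance of a prefetch that runs 'rows_ahead'
// rows in front of the row being consumed, for a row stride of 'lda' floats.
constexpr std::size_t prefetch_distance_bytes(int rows_ahead, int lda) {
    return static_cast<std::size_t>(rows_ahead) * lda * sizeof(float);
}
// e.g. prefetch_distance_bytes(9 /* FIRST_PFD */, 1024) == 36864 bytes.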
 
-namespace arm_gemm
-{
-void a64_sgemv_trans(const float *Astart, const float *Xstart, float *Ystart, float beta, int lda, int M, int N)
-{
+namespace arm_gemm {
+
+void a64_sgemv_trans(const float *Astart, const float *Xstart, float *Ystart, float beta, int lda, int M, int N) {
     const float *a_ptr_base = Astart;
-    float       *y_ptr      = Ystart;
+    float *y_ptr = Ystart;
 
     register const float32x4_t vb asm("v1") = vdupq_n_f32(beta);
 
-    int firstpfd = FIRST_PFD;
-    if(firstpfd > M)
-    {
-        firstpfd = (M - 1);
+    int firstpfd=FIRST_PFD;
+    if (firstpfd > M) {
+        firstpfd = (M-1);
     }
 
     int pfd = PFD;
-    if(pfd > M)
-    {
-        pfd = (M - 1);
+    if (pfd > M) {
+        pfd = (M-1);
     }
 
     ptrdiff_t jump = lda * sizeof(int);
 
-    for(; N >= 96; N -= 96)
-    {
-        int k = M - 1;
+    for (;N>=96;N-=96) {
+        int k = M-1;
 
-        const float *a_ptr       = a_ptr_base;
-        const float *x_ptr       = Xstart;
-        const float *pf_ptr      = a_ptr;
+        const float *a_ptr = a_ptr_base;
+        const float *x_ptr = Xstart;
+        const float *pf_ptr = a_ptr;
         const float *firstpf_ptr = a_ptr;
-        const float *pf_limit    = a_ptr + (M * lda);
+        const float *pf_limit = a_ptr + (M * lda);
 
-        for(int i = 0; i < firstpfd; i++)
-        {
+        for (int i=0; i<firstpfd; i++) {
             prefetch_1x(firstpf_ptr);
             firstpf_ptr += lda;
         }
 
-        for(int i = 0; i < pfd; i++)
-        {
+        for (int i=0; i<pfd; i++) {
             prefetch_5x(pf_ptr + 16);
             pf_ptr += lda;
         }
 
         a_ptr_base += 96;
 
-        __asm __volatile(
-            "movi    v8.4s,#0x0\n"
-            "ldr    w0, [%[x_ptr]]\n"
-            "movi    v9.4s,#0x0\n"
-            "ldr    q2,  [%[a_ptr], #0]\n"
-            "movi    v10.4s,#0x0\n"
-            "ldr    q3,  [%[a_ptr], #0x10]\n"
-            "movi    v11.4s,#0x0\n"
-            "ldr    q4, [%[a_ptr], #0x20]\n"
-            "movi    v12.4s,#0x0\n"
-            "ldr    q5, [%[a_ptr], #0x30]\n"
-            "movi    v13.4s,#0x0\n"
-            "ldr    q6, [%[a_ptr], #0x40]\n"
-            "movi    v14.4s,#0x0\n"
-            "ldr    q7, [%[a_ptr], #0x50]\n"
-            "movi    v15.4s,#0x0\n" ASM_PREFETCH("[%[firstpf_ptr]]")
-            "movi    v16.4s, #0x0\n"
-            "movi    v17.4s, #0x0\n" ASM_PREFETCH("[%[pf_ptr], #64]")
-            "movi    v18.4s, #0x0\n"
-            "movi    v19.4s, #0x0\n" ASM_PREFETCH("[%[pf_ptr], #128]")
-            "movi    v20.4s, #0x0\n"
-            "movi    v21.4s, #0x0\n" ASM_PREFETCH("[%[pf_ptr], #192]")
-            "movi    v22.4s, #0x0\n"
-            "movi    v23.4s, #0x0\n" ASM_PREFETCH("[%[pf_ptr], #256]")
-            "movi    v24.4s, #0x0\n"
-            "movi    v25.4s, #0x0\n" ASM_PREFETCH("[%[pf_ptr], #320]")
-            "movi    v26.4s, #0x0\n"
-            "movi    v27.4s, #0x0\n"
-            "add    %[pf_ptr], %[pf_ptr], %[jump]\n"
-            "movi    v28.4s, #0x0\n"
-            "add    %[firstpf_ptr], %[firstpf_ptr], %[jump]\n"
-            "movi    v29.4s, #0x0\n"
-            "movi    v30.4s, #0x0\n"
-            "movi    v31.4s, #0x0\n"
+        __asm __volatile (
+            "movi	v8.4s,#0x0\n"
+            "ldr	w0, [%[x_ptr]]\n"
+            "movi	v9.4s,#0x0\n"
+            "ldr	q2,  [%[a_ptr], #0]\n"
+            "movi	v10.4s,#0x0\n"
+            "ldr	q3,  [%[a_ptr], #0x10]\n"
+            "movi	v11.4s,#0x0\n"
+            "ldr	q4, [%[a_ptr], #0x20]\n"
+            "movi	v12.4s,#0x0\n"
+            "ldr	q5, [%[a_ptr], #0x30]\n"
+            "movi	v13.4s,#0x0\n"
+            "ldr	q6, [%[a_ptr], #0x40]\n"
+            "movi	v14.4s,#0x0\n"
+            "ldr	q7, [%[a_ptr], #0x50]\n"
+            "movi	v15.4s,#0x0\n"
+            ASM_PREFETCH("[%[firstpf_ptr]]")
+            "movi	v16.4s, #0x0\n"
+            "movi	v17.4s, #0x0\n"
+            ASM_PREFETCH("[%[pf_ptr], #64]")
+            "movi	v18.4s, #0x0\n"
+            "movi	v19.4s, #0x0\n"
+            ASM_PREFETCH("[%[pf_ptr], #128]")
+            "movi	v20.4s, #0x0\n"
+            "movi	v21.4s, #0x0\n"
+            ASM_PREFETCH("[%[pf_ptr], #192]")
+            "movi	v22.4s, #0x0\n"
+            "movi	v23.4s, #0x0\n"
+            ASM_PREFETCH("[%[pf_ptr], #256]")
+            "movi	v24.4s, #0x0\n"
+            "movi	v25.4s, #0x0\n"
+            ASM_PREFETCH("[%[pf_ptr], #320]")
+            "movi	v26.4s, #0x0\n"
+            "movi	v27.4s, #0x0\n"
+            "add	%[pf_ptr], %[pf_ptr], %[jump]\n"
+            "movi	v28.4s, #0x0\n"
+            "add	%[firstpf_ptr], %[firstpf_ptr], %[jump]\n"
+            "movi	v29.4s, #0x0\n"
+            "movi	v30.4s, #0x0\n"
+            "movi	v31.4s, #0x0\n"
 
             // Skip everything if there are no iterations of the main loop to do.
-            "cbz    %w[k], 10f\n"
+            "cbz	%w[k], 10f\n"
 
             // Loop with all prefetches.  Exit this loop when firstpf_ptr
             // hits pf_limit.
             "1:\n"
-            "dup    v0.4s, w0\n"
-            "ldr    w0, [%[x_ptr], #4]\n"
-            "add    %[x_ptr], %[x_ptr], #0x4\n"
-            "fmla    v8.4s, v2.4s, v0.4s\n"
-            "ldr    q2, [%[a_ptr], #0x60]\n"
-            "fmla    v9.4s, v3.4s, v0.4s\n"
-            "ldr    q3, [%[a_ptr], #0x70]\n" ASM_PREFETCH("[%[firstpf_ptr]]")
-            "fmla    v10.4s, v4.4s, v0.4s\n"
-            "ldr    q4, [%[a_ptr], #0x80]\n"
-            "add    %[firstpf_ptr], %[firstpf_ptr], %[jump]\n"
-            "fmla    v11.4s, v5.4s, v0.4s\n"
-            "ldr    q5, [%[a_ptr], #0x90]\n"
-            "sub    %w[k], %w[k], #1\n" ASM_PREFETCH("[%[x_ptr], #128]")
-            "fmla    v12.4s, v6.4s, v0.4s\n"
-            "ldr    q6, [%[a_ptr], #0xa0]\n"
-            "fmla    v13.4s, v7.4s, v0.4s\n"
-            "ldr    q7, [%[a_ptr], #0xb0]\n" ASM_PREFETCH("[%[pf_ptr], #0x40]")
-            "fmla    v14.4s, v2.4s, v0.4s\n"
-            "ldr    q2, [%[a_ptr], #0xc0]\n"
-            "fmla    v15.4s, v3.4s, v0.4s\n"
-            "ldr    q3, [%[a_ptr], #0xd0]\n"
-            "fmla    v16.4s, v4.4s, v0.4s\n"
-            "ldr    q4, [%[a_ptr], #0xe0]\n"
-            "fmla    v17.4s, v5.4s, v0.4s\n"
-            "ldr    q5, [%[a_ptr], #0xf0]\n" ASM_PREFETCH("[%[pf_ptr], #0x80]")
-            "fmla    v18.4s, v6.4s, v0.4s\n"
-            "ldr    q6, [%[a_ptr], #0x100]\n"
-            "fmla    v19.4s, v7.4s, v0.4s\n"
-            "ldr    q7, [%[a_ptr], #0x110]\n"
-            "fmla    v20.4s, v2.4s, v0.4s\n"
-            "ldr    q2, [%[a_ptr], #0x120]\n"
-            "fmla    v21.4s, v3.4s, v0.4s\n"
-            "ldr    q3, [%[a_ptr], #0x130]\n" ASM_PREFETCH("[%[pf_ptr], #0xc0]")
-            "fmla    v22.4s, v4.4s, v0.4s\n"
-            "ldr    q4, [%[a_ptr], #0x140]\n"
-            "fmla    v23.4s, v5.4s, v0.4s\n"
-            "ldr    q5, [%[a_ptr], #0x150]\n"
-            "fmla    v24.4s, v6.4s, v0.4s\n"
-            "ldr    q6, [%[a_ptr], #0x160]\n"
-            "fmla    v25.4s, v7.4s, v0.4s\n"
-            "ldr    q7, [%[a_ptr], #0x170]\n" ASM_PREFETCH("[%[pf_ptr], #0x100]")
-            "add    %[a_ptr], %[a_ptr], %[jump]\n"
-            "fmla    v26.4s, v2.4s, v0.4s\n"
-            "ldr    q2, [%[a_ptr], #0x00]\n"
-            "fmla    v27.4s, v3.4s, v0.4s\n"
-            "ldr    q3, [%[a_ptr], #0x10]\n"
-            "fmla    v28.4s, v4.4s, v0.4s\n"
-            "ldr    q4, [%[a_ptr], #0x20]\n"
-            "fmla    v29.4s, v5.4s, v0.4s\n"
-            "ldr    q5, [%[a_ptr], #0x30]\n" ASM_PREFETCH("[%[pf_ptr], #0x140]")
-            "fmla    v30.4s, v6.4s, v0.4s\n"
-            "add    %[pf_ptr], %[pf_ptr], %[jump]\n"
-            "ldr    q6, [%[a_ptr], #0x40]\n"
-            "fmla    v31.4s, v7.4s, v0.4s\n"
-            "cmp    %[firstpf_ptr], %[pf_limit]\n"
-            "ldr    q7, [%[a_ptr], #0x50]\n"
-            "blt    1b\n"
+            "dup	v0.4s, w0\n"
+            "ldr	w0, [%[x_ptr], #4]\n"
+            "add	%[x_ptr], %[x_ptr], #0x4\n"
+            "fmla	v8.4s, v2.4s, v0.4s\n"
+            "ldr	q2, [%[a_ptr], #0x60]\n"
+            "fmla	v9.4s, v3.4s, v0.4s\n"
+            "ldr	q3, [%[a_ptr], #0x70]\n"
+            ASM_PREFETCH("[%[firstpf_ptr]]")
+            "fmla	v10.4s, v4.4s, v0.4s\n"
+            "ldr	q4, [%[a_ptr], #0x80]\n"
+            "add	%[firstpf_ptr], %[firstpf_ptr], %[jump]\n"
+            "fmla	v11.4s, v5.4s, v0.4s\n"
+            "ldr	q5, [%[a_ptr], #0x90]\n"
+            "sub	%w[k], %w[k], #1\n"
+            ASM_PREFETCH("[%[x_ptr], #128]")
+            "fmla	v12.4s, v6.4s, v0.4s\n"
+            "ldr	q6, [%[a_ptr], #0xa0]\n"
+            "fmla	v13.4s, v7.4s, v0.4s\n"
+            "ldr	q7, [%[a_ptr], #0xb0]\n"
+            ASM_PREFETCH("[%[pf_ptr], #0x40]")
+            "fmla	v14.4s, v2.4s, v0.4s\n"
+            "ldr	q2, [%[a_ptr], #0xc0]\n"
+            "fmla	v15.4s, v3.4s, v0.4s\n"
+            "ldr	q3, [%[a_ptr], #0xd0]\n"
+            "fmla	v16.4s, v4.4s, v0.4s\n"
+            "ldr	q4, [%[a_ptr], #0xe0]\n"
+            "fmla	v17.4s, v5.4s, v0.4s\n"
+            "ldr	q5, [%[a_ptr], #0xf0]\n"
+            ASM_PREFETCH("[%[pf_ptr], #0x80]")
+            "fmla	v18.4s, v6.4s, v0.4s\n"
+            "ldr	q6, [%[a_ptr], #0x100]\n"
+            "fmla	v19.4s, v7.4s, v0.4s\n"
+            "ldr	q7, [%[a_ptr], #0x110]\n"
+            "fmla	v20.4s, v2.4s, v0.4s\n"
+            "ldr	q2, [%[a_ptr], #0x120]\n"
+            "fmla	v21.4s, v3.4s, v0.4s\n"
+            "ldr	q3, [%[a_ptr], #0x130]\n"
+            ASM_PREFETCH("[%[pf_ptr], #0xc0]")
+            "fmla	v22.4s, v4.4s, v0.4s\n"
+            "ldr	q4, [%[a_ptr], #0x140]\n"
+            "fmla	v23.4s, v5.4s, v0.4s\n"
+            "ldr	q5, [%[a_ptr], #0x150]\n"
+            "fmla	v24.4s, v6.4s, v0.4s\n"
+            "ldr	q6, [%[a_ptr], #0x160]\n"
+            "fmla	v25.4s, v7.4s, v0.4s\n"
+            "ldr	q7, [%[a_ptr], #0x170]\n"
+            ASM_PREFETCH("[%[pf_ptr], #0x100]")
+            "add	%[a_ptr], %[a_ptr], %[jump]\n"
+            "fmla	v26.4s, v2.4s, v0.4s\n"
+            "ldr	q2, [%[a_ptr], #0x00]\n"
+            "fmla	v27.4s, v3.4s, v0.4s\n"
+            "ldr	q3, [%[a_ptr], #0x10]\n"
+            "fmla	v28.4s, v4.4s, v0.4s\n"
+            "ldr	q4, [%[a_ptr], #0x20]\n"
+            "fmla	v29.4s, v5.4s, v0.4s\n"
+            "ldr	q5, [%[a_ptr], #0x30]\n"
+            ASM_PREFETCH("[%[pf_ptr], #0x140]")
+            "fmla	v30.4s, v6.4s, v0.4s\n"
+            "add	%[pf_ptr], %[pf_ptr], %[jump]\n"
+            "ldr	q6, [%[a_ptr], #0x40]\n"
+            "fmla	v31.4s, v7.4s, v0.4s\n"
+            "cmp	%[firstpf_ptr], %[pf_limit]\n"
+            "ldr	q7, [%[a_ptr], #0x50]\n"
+            "blt	1b\n"
 
             // Check that there are still "main" prefetches to do.
-            "cmp    %[pf_ptr], %[pf_limit]\n"
-            "bge    9f\n"
+            "cmp	%[pf_ptr], %[pf_limit]\n"
+            "bge	9f\n"
 
             // Just the main prefetches, exit this loop when pf_ptr hits pf_limit.
             "8:\n"
-            "dup    v0.4s, w0\n"
-            "ldr    w0, [%[x_ptr], #4]\n"
-            "add    %[x_ptr], %[x_ptr], #0x4\n"
-            "fmla    v8.4s, v2.4s, v0.4s\n"
-            "ldr    q2, [%[a_ptr], #0x60]\n"
-            "fmla    v9.4s, v3.4s, v0.4s\n"
-            "ldr    q3, [%[a_ptr], #0x70]\n"
-            "fmla    v10.4s, v4.4s, v0.4s\n"
-            "ldr    q4, [%[a_ptr], #0x80]\n"
-            "fmla    v11.4s, v5.4s, v0.4s\n"
-            "ldr    q5, [%[a_ptr], #0x90]\n"
-            "sub    %w[k], %w[k], #1\n" ASM_PREFETCH("[%[x_ptr], #128]")
-            "fmla    v12.4s, v6.4s, v0.4s\n"
-            "ldr    q6, [%[a_ptr], #0xa0]\n"
-            "fmla    v13.4s, v7.4s, v0.4s\n"
-            "ldr    q7, [%[a_ptr], #0xb0]\n" ASM_PREFETCH("[%[pf_ptr], #0x40]")
-            "fmla    v14.4s, v2.4s, v0.4s\n"
-            "ldr    q2, [%[a_ptr], #0xc0]\n"
-            "fmla    v15.4s, v3.4s, v0.4s\n"
-            "ldr    q3, [%[a_ptr], #0xd0]\n"
-            "fmla    v16.4s, v4.4s, v0.4s\n"
-            "ldr    q4, [%[a_ptr], #0xe0]\n"
-            "fmla    v17.4s, v5.4s, v0.4s\n"
-            "ldr    q5, [%[a_ptr], #0xf0]\n" ASM_PREFETCH("[%[pf_ptr], #0x80]")
-            "fmla    v18.4s, v6.4s, v0.4s\n"
-            "ldr    q6, [%[a_ptr], #0x100]\n"
-            "fmla    v19.4s, v7.4s, v0.4s\n"
-            "ldr    q7, [%[a_ptr], #0x110]\n"
-            "fmla    v20.4s, v2.4s, v0.4s\n"
-            "ldr    q2, [%[a_ptr], #0x120]\n"
-            "fmla    v21.4s, v3.4s, v0.4s\n"
-            "ldr    q3, [%[a_ptr], #0x130]\n" ASM_PREFETCH("[%[pf_ptr], #0xc0]")
-            "fmla    v22.4s, v4.4s, v0.4s\n"
-            "ldr    q4, [%[a_ptr], #0x140]\n"
-            "fmla    v23.4s, v5.4s, v0.4s\n"
-            "ldr    q5, [%[a_ptr], #0x150]\n"
-            "fmla    v24.4s, v6.4s, v0.4s\n"
-            "ldr    q6, [%[a_ptr], #0x160]\n"
-            "fmla    v25.4s, v7.4s, v0.4s\n"
-            "ldr    q7, [%[a_ptr], #0x170]\n" ASM_PREFETCH("[%[pf_ptr], #0x100]")
-            "add    %[a_ptr], %[a_ptr], %[jump]\n"
-            "fmla    v26.4s, v2.4s, v0.4s\n"
-            "ldr    q2, [%[a_ptr], #0x00]\n"
-            "fmla    v27.4s, v3.4s, v0.4s\n"
-            "ldr    q3, [%[a_ptr], #0x10]\n"
-            "fmla    v28.4s, v4.4s, v0.4s\n"
-            "ldr    q4, [%[a_ptr], #0x20]\n"
-            "fmla    v29.4s, v5.4s, v0.4s\n"
-            "ldr    q5, [%[a_ptr], #0x30]\n" ASM_PREFETCH("[%[pf_ptr], #0x140]")
-            "fmla    v30.4s, v6.4s, v0.4s\n"
-            "add    %[pf_ptr], %[pf_ptr], %[jump]\n"
-            "ldr    q6, [%[a_ptr], #0x40]\n"
-            "fmla    v31.4s, v7.4s, v0.4s\n"
-            "cmp    %[pf_ptr], %[pf_limit]\n"
-            "ldr    q7, [%[a_ptr], #0x50]\n"
-            "blt    8b\n"
+            "dup	v0.4s, w0\n"
+            "ldr	w0, [%[x_ptr], #4]\n"
+            "add	%[x_ptr], %[x_ptr], #0x4\n"
+            "fmla	v8.4s, v2.4s, v0.4s\n"
+            "ldr	q2, [%[a_ptr], #0x60]\n"
+            "fmla	v9.4s, v3.4s, v0.4s\n"
+            "ldr	q3, [%[a_ptr], #0x70]\n"
+            "fmla	v10.4s, v4.4s, v0.4s\n"
+            "ldr	q4, [%[a_ptr], #0x80]\n"
+            "fmla	v11.4s, v5.4s, v0.4s\n"
+            "ldr	q5, [%[a_ptr], #0x90]\n"
+            "sub	%w[k], %w[k], #1\n"
+            ASM_PREFETCH("[%[x_ptr], #128]")
+            "fmla	v12.4s, v6.4s, v0.4s\n"
+            "ldr	q6, [%[a_ptr], #0xa0]\n"
+            "fmla	v13.4s, v7.4s, v0.4s\n"
+            "ldr	q7, [%[a_ptr], #0xb0]\n"
+            ASM_PREFETCH("[%[pf_ptr], #0x40]")
+            "fmla	v14.4s, v2.4s, v0.4s\n"
+            "ldr	q2, [%[a_ptr], #0xc0]\n"
+            "fmla	v15.4s, v3.4s, v0.4s\n"
+            "ldr	q3, [%[a_ptr], #0xd0]\n"
+            "fmla	v16.4s, v4.4s, v0.4s\n"
+            "ldr	q4, [%[a_ptr], #0xe0]\n"
+            "fmla	v17.4s, v5.4s, v0.4s\n"
+            "ldr	q5, [%[a_ptr], #0xf0]\n"
+            ASM_PREFETCH("[%[pf_ptr], #0x80]")
+            "fmla	v18.4s, v6.4s, v0.4s\n"
+            "ldr	q6, [%[a_ptr], #0x100]\n"
+            "fmla	v19.4s, v7.4s, v0.4s\n"
+            "ldr	q7, [%[a_ptr], #0x110]\n"
+            "fmla	v20.4s, v2.4s, v0.4s\n"
+            "ldr	q2, [%[a_ptr], #0x120]\n"
+            "fmla	v21.4s, v3.4s, v0.4s\n"
+            "ldr	q3, [%[a_ptr], #0x130]\n"
+            ASM_PREFETCH("[%[pf_ptr], #0xc0]")
+            "fmla	v22.4s, v4.4s, v0.4s\n"
+            "ldr	q4, [%[a_ptr], #0x140]\n"
+            "fmla	v23.4s, v5.4s, v0.4s\n"
+            "ldr	q5, [%[a_ptr], #0x150]\n"
+            "fmla	v24.4s, v6.4s, v0.4s\n"
+            "ldr	q6, [%[a_ptr], #0x160]\n"
+            "fmla	v25.4s, v7.4s, v0.4s\n"
+            "ldr	q7, [%[a_ptr], #0x170]\n"
+            ASM_PREFETCH("[%[pf_ptr], #0x100]")
+            "add	%[a_ptr], %[a_ptr], %[jump]\n"
+            "fmla	v26.4s, v2.4s, v0.4s\n"
+            "ldr	q2, [%[a_ptr], #0x00]\n"
+            "fmla	v27.4s, v3.4s, v0.4s\n"
+            "ldr	q3, [%[a_ptr], #0x10]\n"
+            "fmla	v28.4s, v4.4s, v0.4s\n"
+            "ldr	q4, [%[a_ptr], #0x20]\n"
+            "fmla	v29.4s, v5.4s, v0.4s\n"
+            "ldr	q5, [%[a_ptr], #0x30]\n"
+            ASM_PREFETCH("[%[pf_ptr], #0x140]")
+            "fmla	v30.4s, v6.4s, v0.4s\n"
+            "add	%[pf_ptr], %[pf_ptr], %[jump]\n"
+            "ldr	q6, [%[a_ptr], #0x40]\n"
+            "fmla	v31.4s, v7.4s, v0.4s\n"
+            "cmp	%[pf_ptr], %[pf_limit]\n"
+            "ldr	q7, [%[a_ptr], #0x50]\n"
+            "blt	8b\n"
 
             // Check that there is still work to do.
             "9:\n"
-            "cmp    %w[k], #0\n"
-            "beq    10f\n"
+            "cmp	%w[k], #0\n"
+            "beq	10f\n"
 
             // Loop without prefetches, exit when k hits 0.
             "2:\n"
-            "dup    v0.4s, w0\n"
-            "ldr    w0, [%[x_ptr], #4]\n"
-            "add    %[x_ptr], %[x_ptr], #0x4\n"
-            "fmla    v8.4s, v2.4s, v0.4s\n"
-            "ldr    q2, [%[a_ptr], #0x60]\n"
-            "fmla    v9.4s, v3.4s, v0.4s\n"
-            "ldr    q3, [%[a_ptr], #0x70]\n"
-            "fmla    v10.4s, v4.4s, v0.4s\n"
-            "ldr    q4, [%[a_ptr], #0x80]\n"
-            "fmla    v11.4s, v5.4s, v0.4s\n"
-            "ldr    q5, [%[a_ptr], #0x90]\n"
-            "subs    %w[k], %w[k], #1\n"
-            "fmla    v12.4s, v6.4s, v0.4s\n"
-            "ldr    q6, [%[a_ptr], #0xa0]\n"
-            "fmla    v13.4s, v7.4s, v0.4s\n"
-            "ldr    q7, [%[a_ptr], #0xb0]\n"
-            "fmla    v14.4s, v2.4s, v0.4s\n"
-            "ldr    q2, [%[a_ptr], #0xc0]\n"
-            "fmla    v15.4s, v3.4s, v0.4s\n"
-            "ldr    q3, [%[a_ptr], #0xd0]\n"
-            "fmla    v16.4s, v4.4s, v0.4s\n"
-            "ldr    q4, [%[a_ptr], #0xe0]\n"
-            "fmla    v17.4s, v5.4s, v0.4s\n"
-            "ldr    q5, [%[a_ptr], #0xf0]\n"
-            "fmla    v18.4s, v6.4s, v0.4s\n"
-            "ldr    q6, [%[a_ptr], #0x100]\n"
-            "fmla    v19.4s, v7.4s, v0.4s\n"
-            "ldr    q7, [%[a_ptr], #0x110]\n"
-            "fmla    v20.4s, v2.4s, v0.4s\n"
-            "ldr    q2, [%[a_ptr], #0x120]\n"
-            "fmla    v21.4s, v3.4s, v0.4s\n"
-            "ldr    q3, [%[a_ptr], #0x130]\n"
-            "fmla    v22.4s, v4.4s, v0.4s\n"
-            "ldr    q4, [%[a_ptr], #0x140]\n"
-            "fmla    v23.4s, v5.4s, v0.4s\n"
-            "ldr    q5, [%[a_ptr], #0x150]\n"
-            "fmla    v24.4s, v6.4s, v0.4s\n"
-            "ldr    q6, [%[a_ptr], #0x160]\n"
-            "fmla    v25.4s, v7.4s, v0.4s\n"
-            "ldr    q7, [%[a_ptr], #0x170]\n"
-            "add    %[a_ptr], %[a_ptr], %[jump]\n"
-            "fmla    v26.4s, v2.4s, v0.4s\n"
-            "ldr    q2, [%[a_ptr], #0x00]\n"
-            "fmla    v27.4s, v3.4s, v0.4s\n"
-            "ldr    q3, [%[a_ptr], #0x10]\n"
-            "fmla    v28.4s, v4.4s, v0.4s\n"
-            "ldr    q4, [%[a_ptr], #0x20]\n"
-            "fmla    v29.4s, v5.4s, v0.4s\n"
-            "ldr    q5, [%[a_ptr], #0x30]\n"
-            "fmla    v30.4s, v6.4s, v0.4s\n"
-            "ldr    q6, [%[a_ptr], #0x40]\n"
-            "fmla    v31.4s, v7.4s, v0.4s\n"
-            "ldr    q7, [%[a_ptr], #0x50]\n"
-            "bne    2b\n"
+            "dup	v0.4s, w0\n"
+            "ldr	w0, [%[x_ptr], #4]\n"
+            "add	%[x_ptr], %[x_ptr], #0x4\n"
+            "fmla	v8.4s, v2.4s, v0.4s\n"
+            "ldr	q2, [%[a_ptr], #0x60]\n"
+            "fmla	v9.4s, v3.4s, v0.4s\n"
+            "ldr	q3, [%[a_ptr], #0x70]\n"
+            "fmla	v10.4s, v4.4s, v0.4s\n"
+            "ldr	q4, [%[a_ptr], #0x80]\n"
+            "fmla	v11.4s, v5.4s, v0.4s\n"
+            "ldr	q5, [%[a_ptr], #0x90]\n"
+            "subs	%w[k], %w[k], #1\n"
+            "fmla	v12.4s, v6.4s, v0.4s\n"
+            "ldr	q6, [%[a_ptr], #0xa0]\n"
+            "fmla	v13.4s, v7.4s, v0.4s\n"
+            "ldr	q7, [%[a_ptr], #0xb0]\n"
+            "fmla	v14.4s, v2.4s, v0.4s\n"
+            "ldr	q2, [%[a_ptr], #0xc0]\n"
+            "fmla	v15.4s, v3.4s, v0.4s\n"
+            "ldr	q3, [%[a_ptr], #0xd0]\n"
+            "fmla	v16.4s, v4.4s, v0.4s\n"
+            "ldr	q4, [%[a_ptr], #0xe0]\n"
+            "fmla	v17.4s, v5.4s, v0.4s\n"
+            "ldr	q5, [%[a_ptr], #0xf0]\n"
+            "fmla	v18.4s, v6.4s, v0.4s\n"
+            "ldr	q6, [%[a_ptr], #0x100]\n"
+            "fmla	v19.4s, v7.4s, v0.4s\n"
+            "ldr	q7, [%[a_ptr], #0x110]\n"
+            "fmla	v20.4s, v2.4s, v0.4s\n"
+            "ldr	q2, [%[a_ptr], #0x120]\n"
+            "fmla	v21.4s, v3.4s, v0.4s\n"
+            "ldr	q3, [%[a_ptr], #0x130]\n"
+            "fmla	v22.4s, v4.4s, v0.4s\n"
+            "ldr	q4, [%[a_ptr], #0x140]\n"
+            "fmla	v23.4s, v5.4s, v0.4s\n"
+            "ldr	q5, [%[a_ptr], #0x150]\n"
+            "fmla	v24.4s, v6.4s, v0.4s\n"
+            "ldr	q6, [%[a_ptr], #0x160]\n"
+            "fmla	v25.4s, v7.4s, v0.4s\n"
+            "ldr	q7, [%[a_ptr], #0x170]\n"
+            "add	%[a_ptr], %[a_ptr], %[jump]\n"
+            "fmla	v26.4s, v2.4s, v0.4s\n"
+            "ldr	q2, [%[a_ptr], #0x00]\n"
+            "fmla	v27.4s, v3.4s, v0.4s\n"
+            "ldr	q3, [%[a_ptr], #0x10]\n"
+            "fmla	v28.4s, v4.4s, v0.4s\n"
+            "ldr	q4, [%[a_ptr], #0x20]\n"
+            "fmla	v29.4s, v5.4s, v0.4s\n"
+            "ldr	q5, [%[a_ptr], #0x30]\n"
+            "fmla	v30.4s, v6.4s, v0.4s\n"
+            "ldr	q6, [%[a_ptr], #0x40]\n"
+            "fmla	v31.4s, v7.4s, v0.4s\n"
+            "ldr	q7, [%[a_ptr], #0x50]\n"
+            "bne	2b\n"
 
             "10:\n"
 
             // Final iteration
-            "dup    v0.4s, w0\n"
-            "fmla    v8.4s, v2.4s, v0.4s\n"
-            "ldr    q2, [%[a_ptr], #0x60]\n"
-            "fmla    v9.4s, v3.4s, v0.4s\n"
-            "ldr    q3, [%[a_ptr], #0x70]\n"
-            "fmla    v10.4s, v4.4s, v0.4s\n"
-            "ldr    q4, [%[a_ptr], #0x80]\n"
-            "fmla    v11.4s, v5.4s, v0.4s\n"
-            "ldr    q5, [%[a_ptr], #0x90]\n"
-            "fmla    v12.4s, v6.4s, v0.4s\n"
-            "ldr    q6, [%[a_ptr], #0xa0]\n"
-            "fmla    v13.4s, v7.4s, v0.4s\n"
-            "ldr    q7, [%[a_ptr], #0xb0]\n"
-            "fmla    v14.4s, v2.4s, v0.4s\n"
-            "ldr    q2, [%[a_ptr], #0xc0]\n"
-            "fmla    v15.4s, v3.4s, v0.4s\n"
-            "ldr    q3, [%[a_ptr], #0xd0]\n"
-            "fmla    v16.4s, v4.4s, v0.4s\n"
-            "ldr    q4, [%[a_ptr], #0xe0]\n"
-            "fmla    v17.4s, v5.4s, v0.4s\n"
-            "ldr    q5, [%[a_ptr], #0xf0]\n"
-            "fmla    v18.4s, v6.4s, v0.4s\n"
+            "dup	v0.4s, w0\n"
+            "fmla	v8.4s, v2.4s, v0.4s\n"
+            "ldr	q2, [%[a_ptr], #0x60]\n"
+            "fmla	v9.4s, v3.4s, v0.4s\n"
+            "ldr	q3, [%[a_ptr], #0x70]\n"
+            "fmla	v10.4s, v4.4s, v0.4s\n"
+            "ldr	q4, [%[a_ptr], #0x80]\n"
+            "fmla	v11.4s, v5.4s, v0.4s\n"
+            "ldr	q5, [%[a_ptr], #0x90]\n"
+            "fmla	v12.4s, v6.4s, v0.4s\n"
+            "ldr	q6, [%[a_ptr], #0xa0]\n"
+            "fmla	v13.4s, v7.4s, v0.4s\n"
+            "ldr	q7, [%[a_ptr], #0xb0]\n"
+            "fmla	v14.4s, v2.4s, v0.4s\n"
+            "ldr	q2, [%[a_ptr], #0xc0]\n"
+            "fmla	v15.4s, v3.4s, v0.4s\n"
+            "ldr	q3, [%[a_ptr], #0xd0]\n"
+            "fmla	v16.4s, v4.4s, v0.4s\n"
+            "ldr	q4, [%[a_ptr], #0xe0]\n"
+            "fmla	v17.4s, v5.4s, v0.4s\n"
+            "ldr	q5, [%[a_ptr], #0xf0]\n"
+            "fmla	v18.4s, v6.4s, v0.4s\n"
 
-            "ldr    q6, [%[a_ptr], #0x100]\n"
-            "fmla    v19.4s, v7.4s, v0.4s\n"
-            "ldr    q7, [%[a_ptr], #0x110]\n"
-            "fmla    v20.4s, v2.4s, v0.4s\n"
-            "ldr    q2, [%[a_ptr], #0x120]\n"
-            "fmla    v21.4s, v3.4s, v0.4s\n"
-            "ldr    q3, [%[a_ptr], #0x130]\n"
-            "fmla    v22.4s, v4.4s, v0.4s\n"
-            "ldr    q4, [%[a_ptr], #0x140]\n"
-            "fmla    v23.4s, v5.4s, v0.4s\n"
-            "ldr    q5, [%[a_ptr], #0x150]\n"
-            "fmla    v24.4s, v6.4s, v0.4s\n"
-            "ldr    q6, [%[a_ptr], #0x160]\n"
-            "fmla    v25.4s, v7.4s, v0.4s\n"
-            "ldr    q7, [%[a_ptr], #0x170]\n"
-            "fmla    v26.4s, v2.4s, v0.4s\n"
-            "ldr    q2,  [%[y_ptr]]\n"
-            "fmla    v27.4s, v3.4s, v0.4s\n"
-            "ldr    q3,  [%[y_ptr], #0x10]\n"
-            "fmla    v28.4s, v4.4s, v0.4s\n"
-            "ldr    q4,  [%[y_ptr], #0x20]\n"
-            "fmla    v29.4s, v5.4s, v0.4s\n"
-            "ldr    q5,  [%[y_ptr], #0x30]\n"
-            "fmla    v30.4s, v6.4s, v0.4s\n"
-            "ldr    q6,  [%[y_ptr], #0x40]\n"
-            "fmla    v31.4s, v7.4s, v0.4s\n"
-            "ldr    q7,  [%[y_ptr], #0x50]\n"
+            "ldr	q6, [%[a_ptr], #0x100]\n"
+            "fmla	v19.4s, v7.4s, v0.4s\n"
+            "ldr	q7, [%[a_ptr], #0x110]\n"
+            "fmla	v20.4s, v2.4s, v0.4s\n"
+            "ldr	q2, [%[a_ptr], #0x120]\n"
+            "fmla	v21.4s, v3.4s, v0.4s\n"
+            "ldr	q3, [%[a_ptr], #0x130]\n"
+            "fmla	v22.4s, v4.4s, v0.4s\n"
+            "ldr	q4, [%[a_ptr], #0x140]\n"
+            "fmla	v23.4s, v5.4s, v0.4s\n"
+            "ldr	q5, [%[a_ptr], #0x150]\n"
+            "fmla	v24.4s, v6.4s, v0.4s\n"
+            "ldr	q6, [%[a_ptr], #0x160]\n"
+            "fmla	v25.4s, v7.4s, v0.4s\n"
+            "ldr	q7, [%[a_ptr], #0x170]\n"
+            "fmla	v26.4s, v2.4s, v0.4s\n"
+            "ldr	q2,  [%[y_ptr]]\n"
+            "fmla	v27.4s, v3.4s, v0.4s\n"
+            "ldr	q3,  [%[y_ptr], #0x10]\n"
+            "fmla	v28.4s, v4.4s, v0.4s\n"
+            "ldr	q4,  [%[y_ptr], #0x20]\n"
+            "fmla	v29.4s, v5.4s, v0.4s\n"
+            "ldr	q5,  [%[y_ptr], #0x30]\n"
+            "fmla	v30.4s, v6.4s, v0.4s\n"
+            "ldr	q6,  [%[y_ptr], #0x40]\n"
+            "fmla	v31.4s, v7.4s, v0.4s\n"
+            "ldr	q7,  [%[y_ptr], #0x50]\n"
 
-            "fmla    v8.4s, v2.4s, %[vb].4s\n"
-            "ldr    q2, [%[y_ptr], #0x60]\n"
-            "fmla    v9.4s, v3.4s, %[vb].4s\n"
-            "ldr    q3, [%[y_ptr], #0x70]\n"
-            "fmla    v10.4s, v4.4s, %[vb].4s\n"
-            "ldr    q4, [%[y_ptr], #0x80]\n"
-            "fmla    v11.4s, v5.4s, %[vb].4s\n"
-            "ldr    q5, [%[y_ptr], #0x90]\n"
-            "fmla    v12.4s, v6.4s, %[vb].4s\n"
-            "ldr    q6, [%[y_ptr], #0xa0]\n"
-            "str    q8, [%[y_ptr], #0x00]\n"
-            "fmla    v13.4s, v7.4s, %[vb].4s\n"
-            "ldr    q7, [%[y_ptr], #0xb0]\n"
-            "str    q9, [%[y_ptr], #0x10]\n"
-            "fmla    v14.4s, v2.4s, %[vb].4s\n"
-            "ldr    q2, [%[y_ptr], #0xc0]\n"
-            "str    q10, [%[y_ptr], #0x20]\n"
-            "fmla    v15.4s, v3.4s, %[vb].4s\n"
-            "ldr    q3, [%[y_ptr], #0xd0]\n"
-            "str    q11, [%[y_ptr], #0x30]\n"
-            "fmla    v16.4s, v4.4s, %[vb].4s\n"
-            "ldr    q4, [%[y_ptr], #0xe0]\n"
-            "str    q12, [%[y_ptr], #0x40]\n"
-            "fmla    v17.4s, v5.4s, %[vb].4s\n"
-            "ldr    q5, [%[y_ptr], #0xf0]\n"
-            "str    q13, [%[y_ptr], #0x50]\n"
-            "fmla    v18.4s, v6.4s, %[vb].4s\n"
-            "ldr    q6, [%[y_ptr], #0x100]\n"
-            "str    q14, [%[y_ptr], #0x60]\n"
-            "fmla    v19.4s, v7.4s, %[vb].4s\n"
-            "ldr    q7, [%[y_ptr], #0x110]\n"
-            "str    q15, [%[y_ptr], #0x70]\n"
-            "fmla    v20.4s, v2.4s, %[vb].4s\n"
-            "ldr    q2, [%[y_ptr], #0x120]\n"
-            "str    q16, [%[y_ptr], #0x80]\n"
-            "fmla    v21.4s, v3.4s, %[vb].4s\n"
-            "ldr    q3, [%[y_ptr], #0x130]\n"
-            "str    q17, [%[y_ptr], #0x90]\n"
-            "fmla    v22.4s, v4.4s, %[vb].4s\n"
-            "ldr    q4, [%[y_ptr], #0x140]\n"
-            "str    q18, [%[y_ptr], #0xa0]\n"
-            "fmla    v23.4s, v5.4s, %[vb].4s\n"
-            "ldr    q5, [%[y_ptr], #0x150]\n"
-            "str    q19, [%[y_ptr], #0xb0]\n"
-            "fmla    v24.4s, v6.4s, %[vb].4s\n"
-            "ldr    q6, [%[y_ptr], #0x160]\n"
-            "str    q20, [%[y_ptr], #0xc0]\n"
-            "fmla    v25.4s, v7.4s, %[vb].4s\n"
-            "ldr    q7, [%[y_ptr], #0x170]\n"
-            "str    q21, [%[y_ptr], #0xd0]\n"
-            "fmla    v26.4s, v2.4s, %[vb].4s\n"
-            "str    q22, [%[y_ptr], #0xe0]\n"
-            "fmla    v27.4s, v3.4s, %[vb].4s\n"
-            "str    q23, [%[y_ptr], #0xf0]\n"
-            "fmla    v28.4s, v4.4s, %[vb].4s\n"
-            "str    q24, [%[y_ptr], #0x100]\n"
-            "fmla    v29.4s, v5.4s, %[vb].4s\n"
-            "str    q25, [%[y_ptr], #0x110]\n"
-            "fmla    v30.4s, v6.4s, %[vb].4s\n"
-            "str    q26, [%[y_ptr], #0x120]\n"
-            "fmla    v31.4s, v7.4s, %[vb].4s\n"
-            "str    q27, [%[y_ptr], #0x130]\n"
+            "fmla	v8.4s, v2.4s, %[vb].4s\n"
+            "ldr	q2, [%[y_ptr], #0x60]\n"
+            "fmla	v9.4s, v3.4s, %[vb].4s\n"
+            "ldr	q3, [%[y_ptr], #0x70]\n"
+            "fmla	v10.4s, v4.4s, %[vb].4s\n"
+            "ldr	q4, [%[y_ptr], #0x80]\n"
+            "fmla	v11.4s, v5.4s, %[vb].4s\n"
+            "ldr	q5, [%[y_ptr], #0x90]\n"
+            "fmla	v12.4s, v6.4s, %[vb].4s\n"
+            "ldr	q6, [%[y_ptr], #0xa0]\n"
+            "str	q8, [%[y_ptr], #0x00]\n"
+            "fmla	v13.4s, v7.4s, %[vb].4s\n"
+            "ldr	q7, [%[y_ptr], #0xb0]\n"
+            "str	q9, [%[y_ptr], #0x10]\n"
+            "fmla	v14.4s, v2.4s, %[vb].4s\n"
+            "ldr	q2, [%[y_ptr], #0xc0]\n"
+            "str	q10, [%[y_ptr], #0x20]\n"
+            "fmla	v15.4s, v3.4s, %[vb].4s\n"
+            "ldr	q3, [%[y_ptr], #0xd0]\n"
+            "str	q11, [%[y_ptr], #0x30]\n"
+            "fmla	v16.4s, v4.4s, %[vb].4s\n"
+            "ldr	q4, [%[y_ptr], #0xe0]\n"
+            "str	q12, [%[y_ptr], #0x40]\n"
+            "fmla	v17.4s, v5.4s, %[vb].4s\n"
+            "ldr	q5, [%[y_ptr], #0xf0]\n"
+            "str	q13, [%[y_ptr], #0x50]\n"
+            "fmla	v18.4s, v6.4s, %[vb].4s\n"
+            "ldr	q6, [%[y_ptr], #0x100]\n"
+            "str	q14, [%[y_ptr], #0x60]\n"
+            "fmla	v19.4s, v7.4s, %[vb].4s\n"
+            "ldr	q7, [%[y_ptr], #0x110]\n"
+            "str	q15, [%[y_ptr], #0x70]\n"
+            "fmla	v20.4s, v2.4s, %[vb].4s\n"
+            "ldr	q2, [%[y_ptr], #0x120]\n"
+            "str	q16, [%[y_ptr], #0x80]\n"
+            "fmla	v21.4s, v3.4s, %[vb].4s\n"
+            "ldr	q3, [%[y_ptr], #0x130]\n"
+            "str	q17, [%[y_ptr], #0x90]\n"
+            "fmla	v22.4s, v4.4s, %[vb].4s\n"
+            "ldr	q4, [%[y_ptr], #0x140]\n"
+            "str	q18, [%[y_ptr], #0xa0]\n"
+            "fmla	v23.4s, v5.4s, %[vb].4s\n"
+            "ldr	q5, [%[y_ptr], #0x150]\n"
+            "str	q19, [%[y_ptr], #0xb0]\n"
+            "fmla	v24.4s, v6.4s, %[vb].4s\n"
+            "ldr	q6, [%[y_ptr], #0x160]\n"
+            "str	q20, [%[y_ptr], #0xc0]\n"
+            "fmla	v25.4s, v7.4s, %[vb].4s\n"
+            "ldr	q7, [%[y_ptr], #0x170]\n"
+            "str	q21, [%[y_ptr], #0xd0]\n"
+            "fmla	v26.4s, v2.4s, %[vb].4s\n"
+            "str	q22, [%[y_ptr], #0xe0]\n"
+            "fmla	v27.4s, v3.4s, %[vb].4s\n"
+            "str	q23, [%[y_ptr], #0xf0]\n"
+            "fmla	v28.4s, v4.4s, %[vb].4s\n"
+            "str	q24, [%[y_ptr], #0x100]\n"
+            "fmla	v29.4s, v5.4s, %[vb].4s\n"
+            "str	q25, [%[y_ptr], #0x110]\n"
+            "fmla	v30.4s, v6.4s, %[vb].4s\n"
+            "str	q26, [%[y_ptr], #0x120]\n"
+            "fmla	v31.4s, v7.4s, %[vb].4s\n"
+            "str	q27, [%[y_ptr], #0x130]\n"
 
-            "stp    q28, q29, [%[y_ptr], #0x140]\n"
-            "stp    q30, q31, [%[y_ptr], #0x160]\n"
-            "add    %[y_ptr], %[y_ptr], #0x180\n"
+            "stp	q28, q29, [%[y_ptr], #0x140]\n"
+            "stp	q30, q31, [%[y_ptr], #0x160]\n"
+            "add	%[y_ptr], %[y_ptr], #0x180\n"
 
-            : [a_ptr] "+r"(a_ptr), [x_ptr] "+r"(x_ptr), [y_ptr] "+r"(y_ptr), [k] "+r"(k), [pf_ptr] "+r"(pf_ptr), [firstpf_ptr] "+r"(firstpf_ptr)
-            : [jump] "r"(jump), [vb] "w"(vb), [pf_limit] "r"(pf_limit)
-            : "w0", "v0", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
+          : [a_ptr] "+r" (a_ptr), [x_ptr] "+r" (x_ptr), [y_ptr] "+r" (y_ptr), [k] "+r" (k), [pf_ptr] "+r" (pf_ptr), [firstpf_ptr] "+r" (firstpf_ptr)
+          : [jump] "r" (jump), [vb] "w" (vb), [pf_limit] "r" (pf_limit)
+          : "w0", "v0", "v2", "v3", "v4", "v5", "v6", "v7", "v8",  "v9", "v10", "v11", "v12", "v13",
             "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
-            "v27", "v28", "v29", "v30", "v31", "cc");
+            "v27", "v28", "v29", "v30", "v31", "cc"
+        );
     }
 
-    if(N > 0)
-    {
+    if (N>0) {
         // Handle N tail - up to 95 stragglers.
         // This is 0-23 vectors, plus optionally an 64-bit vector and/or a
         // single value for the remainder.
 
         // Independent pointers into the matrix for the odd 2 and odd 1.
         // Double up as flag to indicate whether they are needed.
-        const float *odd2_aptr = NULL;
-        const float *odd1_aptr = NULL;
+        const float *odd2_aptr=NULL;
+        const float *odd1_aptr=NULL;
 
         // Figure out how much work we need to do.
-        int numvecs = N / 4;
-        int rem     = N % 4;
-        int k       = M;
+        int numvecs = N/4;
+        int rem = N%4;
+        int k=M;
 
         // Set up pointers for the odd 2/1 if needed.
-        if(rem >= 2)
-        {
+        if (rem >= 2) {
             odd2_aptr = a_ptr_base + (numvecs * 4);
         }
 
-        if(rem & 1)
-        {
-            odd1_aptr = a_ptr_base + (numvecs * 4) + (odd2_aptr == NULL ? 0 : 2);
+        if (rem & 1) {
+            odd1_aptr = a_ptr_base + (numvecs * 4) + (odd2_aptr==NULL ? 0 : 2);
         }
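To make the tail split concrete: the largest possible tail of 95 stragglers decomposes into 23 full 4-float vectors plus both odd pieces, so odd2_aptr sits 92 floats past a_ptr_base and odd1_aptr two floats after it. A small hedged example of the same arithmetic (offsets only, not library code):

#include <cstdio>

// Worked example for N = 95, mirroring the numvecs/rem/odd2/odd1 setup above.
int main() {
    int N = 95;
    int numvecs  = N / 4;                              // 23 full 4-float vectors
    int rem      = N % 4;                              // 3 -> odd2 (2 floats) and odd1 (1 float)
    int odd2_off = numvecs * 4;                        // +92
    int odd1_off = numvecs * 4 + (rem >= 2 ? 2 : 0);   // +94
    std::printf("%d vectors, odd2 at +%d, odd1 at +%d\n", numvecs, odd2_off, odd1_off);
    return 0;
}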
 
-        const float *a_ptr       = a_ptr_base;
+        const float *a_ptr = a_ptr_base;
         const float *firstpf_ptr = a_ptr_base;
-        const float *pf_ptr      = a_ptr_base;
-        const float *pf_limit    = a_ptr + (M * lda);
+        const float *pf_ptr = a_ptr_base;
+        const float *pf_limit = a_ptr + (M * lda);
 
         const float *x_ptr = Xstart;
-        int          vecs  = 0; // Working variable to count how many vectors to work on.
-        int          dopf  = 1; // Track whether we are doing prefetches.
+        int vecs=0; // Working variable to count how many vectors to work on.
+        int dopf=1; // Track whether we are doing prefetches.
 
         // Figure out how many cache lines we need to prefetch each time.
         int numpfs = (N + 15) / 16;
 
         // Do initial prefetches
-        for(int i = 0; i < firstpfd + 1; i++)
-        {
+        for (int i=0; i<firstpfd+1; i++) {
             prefetch_1x(firstpf_ptr);
             firstpf_ptr += lda;
         }
 
         // Do "main" prefetches - adapt number to the number we actually need.
-        if(numpfs > 1)
-        {
-            for(int i = 0; i < pfd + 1; i++)
-            {
-                switch(numpfs)
-                {
+        if (numpfs > 1) {
+            for (int i=0; i<pfd+1; i++) {
+                switch (numpfs) {
                     case 2:
                         prefetch_1x(pf_ptr + 16);
                         break;
@@ -525,387 +533,392 @@
                 }
                 pf_ptr += lda;
             }
-        }
-        else
-        {
+        } else {
             // Just disable additional prefetches
-            dopf = 0;
+            dopf=0;
         }
 
         // Do the real work
-        __asm __volatile(
+        __asm __volatile (
             // Initialize all the vectors - not worth skipping this if only
             // some are needed.
-            "movi    v8.4s,#0x0\n"
-            "ldr    w0, [%[x_ptr]]\n"
-            "movi    v9.4s,#0x0\n"
-            "movi    v10.4s,#0x0\n"
-            "movi    v11.4s,#0x0\n"
-            "movi    v12.4s,#0x0\n"
-            "movi    v13.4s,#0x0\n"
-            "movi    v14.4s,#0x0\n"
-            "movi    v15.4s,#0x0\n"
-            "movi    v16.4s, #0x0\n"
-            "movi    v17.4s, #0x0\n"
-            "movi    v18.4s, #0x0\n"
-            "movi    v19.4s, #0x0\n"
-            "movi    v20.4s, #0x0\n"
-            "movi    v21.4s, #0x0\n"
-            "movi    v22.4s, #0x0\n"
-            "movi    v23.4s, #0x0\n"
-            "movi    v24.4s, #0x0\n"
-            "movi    v25.4s, #0x0\n"
-            "movi    v26.4s, #0x0\n"
-            "movi    v27.4s, #0x0\n"
-            "movi    v28.4s, #0x0\n"
-            "movi    v29.4s, #0x0\n"
-            "movi    v30.4s, #0x0\n"
-            "movi    v6.2s, #0x0\n"
-            "movi    v5.2s, #0x0\n"
+            "movi	v8.4s,#0x0\n"
+            "ldr	w0, [%[x_ptr]]\n"
+            "movi	v9.4s,#0x0\n"
+            "movi	v10.4s,#0x0\n"
+            "movi	v11.4s,#0x0\n"
+            "movi	v12.4s,#0x0\n"
+            "movi	v13.4s,#0x0\n"
+            "movi	v14.4s,#0x0\n"
+            "movi	v15.4s,#0x0\n"
+            "movi	v16.4s, #0x0\n"
+            "movi	v17.4s, #0x0\n"
+            "movi	v18.4s, #0x0\n"
+            "movi	v19.4s, #0x0\n"
+            "movi	v20.4s, #0x0\n"
+            "movi	v21.4s, #0x0\n"
+            "movi	v22.4s, #0x0\n"
+            "movi	v23.4s, #0x0\n"
+            "movi	v24.4s, #0x0\n"
+            "movi	v25.4s, #0x0\n"
+            "movi	v26.4s, #0x0\n"
+            "movi	v27.4s, #0x0\n"
+            "movi	v28.4s, #0x0\n"
+            "movi	v29.4s, #0x0\n"
+            "movi	v30.4s, #0x0\n"
+            "movi	v6.2s, #0x0\n"
+            "movi	v5.2s, #0x0\n"
 
-            "1:\n" ASM_PREFETCH("[%[firstpf_ptr]]\n")
+            "1:\n"
+            ASM_PREFETCH("[%[firstpf_ptr]]\n")
             "11:\n"
-            "dup    v0.4s, w0\n"
-            "ldr    w0, [%[x_ptr], #4]\n"
-            "add    %[x_ptr], %[x_ptr], #4\n"
+            "dup	v0.4s, w0\n"
+            "ldr	w0, [%[x_ptr], #4]\n"
+            "add	%[x_ptr], %[x_ptr], #4\n"
 
-            "cbz    %w[numvecs], 2f\n"
-            "mov    %w[vecs], %w[numvecs]\n"
+            "cbz	%w[numvecs], 2f\n"
+            "mov	%w[vecs], %w[numvecs]\n"
 
             // Vector 0
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7,[%[a_ptr], #0x00]\n"
-            "fmla    v8.4s, v7.4s, v0.4s\n"
-            "beq    2f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7,[%[a_ptr], #0x00]\n"
+            "fmla	v8.4s, v7.4s, v0.4s\n"
+            "beq	2f\n"
             // Vector 1
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7,[%[a_ptr], #0x10]\n"
-            "fmla    v9.4s, v7.4s, v0.4s\n"
-            "beq    2f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7,[%[a_ptr], #0x10]\n"
+            "fmla	v9.4s, v7.4s, v0.4s\n"
+            "beq	2f\n"
             // Vector 2
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7,[%[a_ptr], #0x20]\n"
-            "fmla    v10.4s, v7.4s, v0.4s\n"
-            "beq    2f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7,[%[a_ptr], #0x20]\n"
+            "fmla	v10.4s, v7.4s, v0.4s\n"
+            "beq	2f\n"
             // Vector 3
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7,[%[a_ptr], #0x30]\n"
-            "fmla    v11.4s, v7.4s, v0.4s\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7,[%[a_ptr], #0x30]\n"
+            "fmla	v11.4s, v7.4s, v0.4s\n"
             // Prefetch
-            "cbz    %w[dopf], 3f\n" ASM_PREFETCH("[%[pf_ptr], #0x40]")
+            "cbz	%w[dopf], 3f\n"
+            ASM_PREFETCH("[%[pf_ptr], #0x40]")
             "3:\n"
-            "beq    2f\n"
+            "beq	2f\n"
 
             // Vector 4
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7,[%[a_ptr], #0x40]\n"
-            "fmla    v12.4s, v7.4s, v0.4s\n"
-            "beq    2f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7,[%[a_ptr], #0x40]\n"
+            "fmla	v12.4s, v7.4s, v0.4s\n"
+            "beq	2f\n"
             // Vector 5
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7,[%[a_ptr], #0x50]\n"
-            "fmla    v13.4s, v7.4s, v0.4s\n"
-            "beq    2f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7,[%[a_ptr], #0x50]\n"
+            "fmla	v13.4s, v7.4s, v0.4s\n"
+            "beq	2f\n"
             // Vector 6
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7,[%[a_ptr], #0x60]\n"
-            "fmla    v14.4s, v7.4s, v0.4s\n"
-            "beq    2f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7,[%[a_ptr], #0x60]\n"
+            "fmla	v14.4s, v7.4s, v0.4s\n"
+            "beq	2f\n"
             // Vector 7
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7,[%[a_ptr], #0x70]\n"
-            "fmla    v15.4s, v7.4s, v0.4s\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7,[%[a_ptr], #0x70]\n"
+            "fmla	v15.4s, v7.4s, v0.4s\n"
             // Prefetch
-            "cbz    %w[dopf], 4f\n" ASM_PREFETCH("[%[pf_ptr], #0x80]")
+            "cbz	%w[dopf], 4f\n"
+            ASM_PREFETCH("[%[pf_ptr], #0x80]")
             "4:\n"
-            "beq    2f\n"
+            "beq	2f\n"
 
             // Vector 8
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7,[%[a_ptr], #0x80]\n"
-            "fmla    v16.4s, v7.4s, v0.4s\n"
-            "beq    2f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7,[%[a_ptr], #0x80]\n"
+            "fmla	v16.4s, v7.4s, v0.4s\n"
+            "beq	2f\n"
             // Vector 9
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7,[%[a_ptr], #0x90]\n"
-            "fmla    v17.4s, v7.4s, v0.4s\n"
-            "beq    2f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7,[%[a_ptr], #0x90]\n"
+            "fmla	v17.4s, v7.4s, v0.4s\n"
+            "beq	2f\n"
             // Vector 10
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7,[%[a_ptr], #0xa0]\n"
-            "fmla    v18.4s, v7.4s, v0.4s\n"
-            "beq    2f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7,[%[a_ptr], #0xa0]\n"
+            "fmla	v18.4s, v7.4s, v0.4s\n"
+            "beq	2f\n"
             // Vector 11
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7,[%[a_ptr], #0xb0]\n"
-            "fmla    v19.4s, v7.4s, v0.4s\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7,[%[a_ptr], #0xb0]\n"
+            "fmla	v19.4s, v7.4s, v0.4s\n"
             // Prefetch
-            "cbz    %w[dopf], 5f\n" ASM_PREFETCH("[%[pf_ptr], #0xc0]")
+            "cbz	%w[dopf], 5f\n"
+            ASM_PREFETCH("[%[pf_ptr], #0xc0]")
             "5:\n"
-            "beq    2f\n"
+            "beq	2f\n"
 
             // Vector 12
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7,[%[a_ptr], #0xc0]\n"
-            "fmla    v20.4s, v7.4s, v0.4s\n"
-            "beq    2f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7,[%[a_ptr], #0xc0]\n"
+            "fmla	v20.4s, v7.4s, v0.4s\n"
+            "beq	2f\n"
             // Vector 13
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7,[%[a_ptr], #0xd0]\n"
-            "fmla    v21.4s, v7.4s, v0.4s\n"
-            "beq    2f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7,[%[a_ptr], #0xd0]\n"
+            "fmla	v21.4s, v7.4s, v0.4s\n"
+            "beq	2f\n"
             // Vector 14
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7,[%[a_ptr], #0xe0]\n"
-            "fmla    v22.4s, v7.4s, v0.4s\n"
-            "beq    2f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7,[%[a_ptr], #0xe0]\n"
+            "fmla	v22.4s, v7.4s, v0.4s\n"
+            "beq	2f\n"
             // Vector 15
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7,[%[a_ptr], #0xf0]\n"
-            "fmla    v23.4s, v7.4s, v0.4s\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7,[%[a_ptr], #0xf0]\n"
+            "fmla	v23.4s, v7.4s, v0.4s\n"
             // Prefetch
-            "cbz    %w[dopf], 6f\n" ASM_PREFETCH("[%[pf_ptr], #0x100]")
+            "cbz	%w[dopf], 6f\n"
+            ASM_PREFETCH("[%[pf_ptr], #0x100]")
             "6:\n"
-            "beq    2f\n"
+            "beq	2f\n"
 
             // Vector 16
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7,[%[a_ptr], #0x100]\n"
-            "fmla    v24.4s, v7.4s, v0.4s\n"
-            "beq    2f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7,[%[a_ptr], #0x100]\n"
+            "fmla	v24.4s, v7.4s, v0.4s\n"
+            "beq	2f\n"
             // Vector 17
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7,[%[a_ptr], #0x110]\n"
-            "fmla    v25.4s, v7.4s, v0.4s\n"
-            "beq    2f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7,[%[a_ptr], #0x110]\n"
+            "fmla	v25.4s, v7.4s, v0.4s\n"
+            "beq	2f\n"
             // Vector 18
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7,[%[a_ptr], #0x120]\n"
-            "fmla    v26.4s, v7.4s, v0.4s\n"
-            "beq    2f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7,[%[a_ptr], #0x120]\n"
+            "fmla	v26.4s, v7.4s, v0.4s\n"
+            "beq	2f\n"
             // Vector 19
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7,[%[a_ptr], #0x130]\n"
-            "fmla    v27.4s, v7.4s, v0.4s\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7,[%[a_ptr], #0x130]\n"
+            "fmla	v27.4s, v7.4s, v0.4s\n"
             // Prefetch
-            "cbz    %w[dopf], 7f\n" ASM_PREFETCH("[%[pf_ptr], #0x140]")
+            "cbz	%w[dopf], 7f\n"
+            ASM_PREFETCH("[%[pf_ptr], #0x140]")
             "7:\n"
-            "beq    2f\n"
+            "beq	2f\n"
 
             // Vector 20
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7,[%[a_ptr], #0x140]\n"
-            "fmla    v28.4s, v7.4s, v0.4s\n"
-            "beq    2f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7,[%[a_ptr], #0x140]\n"
+            "fmla	v28.4s, v7.4s, v0.4s\n"
+            "beq	2f\n"
             // Vector 21
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7,[%[a_ptr], #0x150]\n"
-            "fmla    v29.4s, v7.4s, v0.4s\n"
-            "beq    2f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7,[%[a_ptr], #0x150]\n"
+            "fmla	v29.4s, v7.4s, v0.4s\n"
+            "beq	2f\n"
             // Vector 22
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7,[%[a_ptr], #0x160]\n"
-            "fmla    v30.4s, v7.4s, v0.4s\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7,[%[a_ptr], #0x160]\n"
+            "fmla	v30.4s, v7.4s, v0.4s\n"
 
             "2:\n"
-            "add    %[a_ptr], %[a_ptr], %[jump]\n"
+            "add	%[a_ptr], %[a_ptr], %[jump]\n"
 
             // Do the odd 2-vector, if needed
-            "cbz    %[odd2_aptr], 8f\n"
-            "ldr    d7, [%[odd2_aptr]]\n"
-            "fmla    v6.2s, v7.2s, v0.2s\n"
-            "add    %[odd2_aptr], %[odd2_aptr], %[jump]\n"
+            "cbz	%[odd2_aptr], 8f\n"
+            "ldr	d7, [%[odd2_aptr]]\n"
+            "fmla	v6.2s, v7.2s, v0.2s\n"
+            "add	%[odd2_aptr], %[odd2_aptr], %[jump]\n"
 
             "8:\n"
             // Do the odd 1-vector, if needed
-            "cbz    %[odd1_aptr], 9f\n"
-            "ldr    s7, [%[odd1_aptr]]\n"
-            "fmla    v5.2s, v7.2s, v0.2s\n"
-            "add    %[odd1_aptr], %[odd1_aptr], %[jump]\n"
+            "cbz	%[odd1_aptr], 9f\n"
+            "ldr	s7, [%[odd1_aptr]]\n"
+            "fmla	v5.2s, v7.2s, v0.2s\n"
+            "add	%[odd1_aptr], %[odd1_aptr], %[jump]\n"
 
             // Get out if needed.
             "9:\n"
-            "subs    %w[k], %w[k], #1\n"
-            "beq    10f\n"
+            "subs	%w[k], %w[k], #1\n"
+            "beq	10f\n"
 
             // Update the "main" prefetch pointer; if it strays beyond the limit, turn off "dopf"
-            "add    %[pf_ptr], %[pf_ptr], %[jump]\n"
-            "cmp    %[pf_ptr], %[pf_limit]\n"
-            "csel    %w[dopf], %w[dopf], WZR, LT\n"
+            "add	%[pf_ptr], %[pf_ptr], %[jump]\n"
+            "cmp	%[pf_ptr], %[pf_limit]\n"
+            "csel	%w[dopf], %w[dopf], WZR, LT\n"
 
             // Update the "leading" prefetch pointer; don't do the first
             // instruction of the loop if it's over the limit.
-            "add    %[firstpf_ptr], %[firstpf_ptr], %[jump]\n"
-            "cmp    %[firstpf_ptr], %[pf_limit]\n"
-            "blt    1b\n"
-            "b        11b\n"
+            "add	%[firstpf_ptr], %[firstpf_ptr], %[jump]\n"
+            "cmp	%[firstpf_ptr], %[pf_limit]\n"
+            "blt	1b\n"
+            "b		11b\n"
 
             // Now write out the outputs
             "10:\n"
-            "cbz    %w[numvecs], 12f\n"
-            "mov    %w[vecs], %w[numvecs]\n"
+            "cbz	%w[numvecs], 12f\n"
+            "mov	%w[vecs], %w[numvecs]\n"
 
             // Vector 0
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7, [%[y_ptr]]\n"
-            "fmla    v8.4s, v7.4s, %[vb].4s\n"
-            "str    q8, [%[y_ptr]], #0x10\n"
-            "beq    12f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7, [%[y_ptr]]\n"
+            "fmla	v8.4s, v7.4s, %[vb].4s\n"
+            "str	q8, [%[y_ptr]], #0x10\n"
+            "beq	12f\n"
             // Vector 1
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7, [%[y_ptr]]\n"
-            "fmla    v9.4s, v7.4s, %[vb].4s\n"
-            "str    q9, [%[y_ptr]], #0x10\n"
-            "beq    12f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7, [%[y_ptr]]\n"
+            "fmla	v9.4s, v7.4s, %[vb].4s\n"
+            "str	q9, [%[y_ptr]], #0x10\n"
+            "beq	12f\n"
             // Vector 2
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7, [%[y_ptr]]\n"
-            "fmla    v10.4s, v7.4s, %[vb].4s\n"
-            "str    q10, [%[y_ptr]], #0x10\n"
-            "beq    12f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7, [%[y_ptr]]\n"
+            "fmla	v10.4s, v7.4s, %[vb].4s\n"
+            "str	q10, [%[y_ptr]], #0x10\n"
+            "beq	12f\n"
             // Vector 3
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7, [%[y_ptr]]\n"
-            "fmla    v11.4s, v7.4s, %[vb].4s\n"
-            "str    q11, [%[y_ptr]], #0x10\n"
-            "beq    12f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7, [%[y_ptr]]\n"
+            "fmla	v11.4s, v7.4s, %[vb].4s\n"
+            "str	q11, [%[y_ptr]], #0x10\n"
+            "beq	12f\n"
             // Vector 4
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7, [%[y_ptr]]\n"
-            "fmla    v12.4s, v7.4s, %[vb].4s\n"
-            "str    q12, [%[y_ptr]], #0x10\n"
-            "beq    12f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7, [%[y_ptr]]\n"
+            "fmla	v12.4s, v7.4s, %[vb].4s\n"
+            "str	q12, [%[y_ptr]], #0x10\n"
+            "beq	12f\n"
             // Vector 5
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7, [%[y_ptr]]\n"
-            "fmla    v13.4s, v7.4s, %[vb].4s\n"
-            "str    q13, [%[y_ptr]], #0x10\n"
-            "beq    12f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7, [%[y_ptr]]\n"
+            "fmla	v13.4s, v7.4s, %[vb].4s\n"
+            "str	q13, [%[y_ptr]], #0x10\n"
+            "beq	12f\n"
             // Vector 6
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7, [%[y_ptr]]\n"
-            "fmla    v14.4s, v7.4s, %[vb].4s\n"
-            "str    q14, [%[y_ptr]], #0x10\n"
-            "beq    12f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7, [%[y_ptr]]\n"
+            "fmla	v14.4s, v7.4s, %[vb].4s\n"
+            "str	q14, [%[y_ptr]], #0x10\n"
+            "beq	12f\n"
             // Vector 7
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7, [%[y_ptr]]\n"
-            "fmla    v15.4s, v7.4s, %[vb].4s\n"
-            "str    q15, [%[y_ptr]], #0x10\n"
-            "beq    12f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7, [%[y_ptr]]\n"
+            "fmla	v15.4s, v7.4s, %[vb].4s\n"
+            "str	q15, [%[y_ptr]], #0x10\n"
+            "beq	12f\n"
             // Vector 8
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7, [%[y_ptr]]\n"
-            "fmla    v16.4s, v7.4s, %[vb].4s\n"
-            "str    q16, [%[y_ptr]], #0x10\n"
-            "beq    12f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7, [%[y_ptr]]\n"
+            "fmla	v16.4s, v7.4s, %[vb].4s\n"
+            "str	q16, [%[y_ptr]], #0x10\n"
+            "beq	12f\n"
             // Vector 9
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7, [%[y_ptr]]\n"
-            "fmla    v17.4s, v7.4s, %[vb].4s\n"
-            "str    q17, [%[y_ptr]], #0x10\n"
-            "beq    12f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7, [%[y_ptr]]\n"
+            "fmla	v17.4s, v7.4s, %[vb].4s\n"
+            "str	q17, [%[y_ptr]], #0x10\n"
+            "beq	12f\n"
             // Vector 10
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7, [%[y_ptr]]\n"
-            "fmla    v18.4s, v7.4s, %[vb].4s\n"
-            "str    q18, [%[y_ptr]], #0x10\n"
-            "beq    12f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7, [%[y_ptr]]\n"
+            "fmla	v18.4s, v7.4s, %[vb].4s\n"
+            "str	q18, [%[y_ptr]], #0x10\n"
+            "beq	12f\n"
             // Vector 11
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7, [%[y_ptr]]\n"
-            "fmla    v19.4s, v7.4s, %[vb].4s\n"
-            "str    q19, [%[y_ptr]], #0x10\n"
-            "beq    12f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7, [%[y_ptr]]\n"
+            "fmla	v19.4s, v7.4s, %[vb].4s\n"
+            "str	q19, [%[y_ptr]], #0x10\n"
+            "beq	12f\n"
             // Vector 12
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7, [%[y_ptr]]\n"
-            "fmla    v20.4s, v7.4s, %[vb].4s\n"
-            "str    q20, [%[y_ptr]], #0x10\n"
-            "beq    12f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7, [%[y_ptr]]\n"
+            "fmla	v20.4s, v7.4s, %[vb].4s\n"
+            "str	q20, [%[y_ptr]], #0x10\n"
+            "beq	12f\n"
             // Vector 13
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7, [%[y_ptr]]\n"
-            "fmla    v21.4s, v7.4s, %[vb].4s\n"
-            "str    q21, [%[y_ptr]], #0x10\n"
-            "beq    12f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7, [%[y_ptr]]\n"
+            "fmla	v21.4s, v7.4s, %[vb].4s\n"
+            "str	q21, [%[y_ptr]], #0x10\n"
+            "beq	12f\n"
             // Vector 14
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7, [%[y_ptr]]\n"
-            "fmla    v22.4s, v7.4s, %[vb].4s\n"
-            "str    q22, [%[y_ptr]], #0x10\n"
-            "beq    12f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7, [%[y_ptr]]\n"
+            "fmla	v22.4s, v7.4s, %[vb].4s\n"
+            "str	q22, [%[y_ptr]], #0x10\n"
+            "beq	12f\n"
             // Vector 15
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7, [%[y_ptr]]\n"
-            "fmla    v23.4s, v7.4s, %[vb].4s\n"
-            "str    q23, [%[y_ptr]], #0x10\n"
-            "beq    12f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7, [%[y_ptr]]\n"
+            "fmla	v23.4s, v7.4s, %[vb].4s\n"
+            "str	q23, [%[y_ptr]], #0x10\n"
+            "beq	12f\n"
             // Vector 16
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7, [%[y_ptr]]\n"
-            "fmla    v24.4s, v7.4s, %[vb].4s\n"
-            "str    q24, [%[y_ptr]], #0x10\n"
-            "beq    12f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7, [%[y_ptr]]\n"
+            "fmla	v24.4s, v7.4s, %[vb].4s\n"
+            "str	q24, [%[y_ptr]], #0x10\n"
+            "beq	12f\n"
             // Vector 17
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7, [%[y_ptr]]\n"
-            "fmla    v25.4s, v7.4s, %[vb].4s\n"
-            "str    q25, [%[y_ptr]], #0x10\n"
-            "beq    12f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7, [%[y_ptr]]\n"
+            "fmla	v25.4s, v7.4s, %[vb].4s\n"
+            "str	q25, [%[y_ptr]], #0x10\n"
+            "beq	12f\n"
             // Vector 18
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7, [%[y_ptr]]\n"
-            "fmla    v26.4s, v7.4s, %[vb].4s\n"
-            "str    q26, [%[y_ptr]], #0x10\n"
-            "beq    12f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7, [%[y_ptr]]\n"
+            "fmla	v26.4s, v7.4s, %[vb].4s\n"
+            "str	q26, [%[y_ptr]], #0x10\n"
+            "beq	12f\n"
             // Vector 19
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7, [%[y_ptr]]\n"
-            "fmla    v27.4s, v7.4s, %[vb].4s\n"
-            "str    q27, [%[y_ptr]], #0x10\n"
-            "beq    12f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7, [%[y_ptr]]\n"
+            "fmla	v27.4s, v7.4s, %[vb].4s\n"
+            "str	q27, [%[y_ptr]], #0x10\n"
+            "beq	12f\n"
             // Vector 20
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7, [%[y_ptr]]\n"
-            "fmla    v28.4s, v7.4s, %[vb].4s\n"
-            "str    q28, [%[y_ptr]], #0x10\n"
-            "beq    12f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7, [%[y_ptr]]\n"
+            "fmla	v28.4s, v7.4s, %[vb].4s\n"
+            "str	q28, [%[y_ptr]], #0x10\n"
+            "beq	12f\n"
             // Vector 21
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7, [%[y_ptr]]\n"
-            "fmla    v29.4s, v7.4s, %[vb].4s\n"
-            "str    q29, [%[y_ptr]], #0x10\n"
-            "beq    12f\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7, [%[y_ptr]]\n"
+            "fmla	v29.4s, v7.4s, %[vb].4s\n"
+            "str	q29, [%[y_ptr]], #0x10\n"
+            "beq	12f\n"
             // Vector 22
-            "subs    %w[vecs], %w[vecs], #1\n"
-            "ldr    q7, [%[y_ptr]]\n"
-            "fmla    v30.4s, v7.4s, %[vb].4s\n"
-            "str    q30, [%[y_ptr]], #0x10\n"
+            "subs	%w[vecs], %w[vecs], #1\n"
+            "ldr	q7, [%[y_ptr]]\n"
+            "fmla	v30.4s, v7.4s, %[vb].4s\n"
+            "str	q30, [%[y_ptr]], #0x10\n"
 
             // Odd 2
             "12:\n"
-            "cbz    %[odd2_aptr], 13f\n"
-            "ldr    d7, [%[y_ptr]]\n"
-            "fmla    v6.2s, v7.2s, %[vb].2s\n"
-            "str    d6, [%[y_ptr]], #0x8\n"
+            "cbz	%[odd2_aptr], 13f\n"
+            "ldr	d7, [%[y_ptr]]\n"
+            "fmla	v6.2s, v7.2s, %[vb].2s\n"
+            "str	d6, [%[y_ptr]], #0x8\n"
 
             // Odd 1
             "13:\n"
-            "cbz    %[odd1_aptr], 14f\n"
-            "ldr    s7, [%[y_ptr]]\n"
-            "fmla    v5.2s, v7.2s, %[vb].2s\n"
-            "str    s5, [%[y_ptr]]\n"
+            "cbz	%[odd1_aptr], 14f\n"
+            "ldr	s7, [%[y_ptr]]\n"
+            "fmla	v5.2s, v7.2s, %[vb].2s\n"
+            "str	s5, [%[y_ptr]]\n"
 
             "14:\n"
-            : [a_ptr] "+r"(a_ptr), [x_ptr] "+r"(x_ptr), [y_ptr] "+r"(y_ptr), [k] "+r"(k),
-            [pf_ptr] "+r"(pf_ptr), [firstpf_ptr] "+r"(firstpf_ptr),
-            [odd1_aptr] "+r"(odd1_aptr), [odd2_aptr] "+r"(odd2_aptr),
-            [dopf] "+r"(dopf), [vecs] "+r"(vecs)
-            : [jump] "r"(jump), [vb] "w"(vb), [pf_limit] "r"(pf_limit), [numvecs] "r"(numvecs)
-            : "w0", "v0", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
+          : [a_ptr] "+r" (a_ptr), [x_ptr] "+r" (x_ptr), [y_ptr] "+r" (y_ptr), [k] "+r" (k),
+            [pf_ptr] "+r" (pf_ptr), [firstpf_ptr] "+r" (firstpf_ptr),
+            [odd1_aptr] "+r" (odd1_aptr), [odd2_aptr] "+r" (odd2_aptr),
+            [dopf] "+r" (dopf), [vecs] "+r" (vecs)
+          : [jump] "r" (jump), [vb] "w" (vb), [pf_limit] "r" (pf_limit), [numvecs] "r" (numvecs)
+          : "w0", "v0", "v2", "v3", "v4", "v5", "v6", "v7", "v8",  "v9", "v10", "v11", "v12", "v13",
             "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
-            "v27", "v28", "v29", "v30", "v31", "cc");
+            "v27", "v28", "v29", "v30", "v31", "cc"
+        );
     }
 }
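
The kernel above splits each row of N columns into full 4-float vectors plus optional 2-element and 1-element tails selected by rem = N % 4, with odd2_aptr/odd1_aptr doubling as "is this tail needed" flags. As a rough standalone sketch of just that pointer arithmetic (plain C++, not part of the patch; show_partition and the sample N values are made up for illustration, the variable names mirror the kernel above):

    #include <cstdio>

    // Illustration of the numvecs/rem/odd2_aptr/odd1_aptr setup used above:
    // partition N columns into 4-wide vector work plus optional tails.
    static void show_partition(int N) {
        int numvecs = N / 4;   // full 4-float vectors handled by q-register FMLAs
        int rem     = N % 4;   // leftover columns (0..3)

        int odd2_offset = -1;  // offset of the 2-float tail, -1 if not needed
        int odd1_offset = -1;  // offset of the final single float, -1 if not needed

        if (rem >= 2) {
            odd2_offset = numvecs * 4;
        }
        if (rem & 1) {
            odd1_offset = numvecs * 4 + (odd2_offset < 0 ? 0 : 2);
        }

        printf("N=%2d: %d vectors, odd2 at %d, odd1 at %d\n",
               N, numvecs, odd2_offset, odd1_offset);
    }

    int main() {
        show_partition(8);   // 2 vectors, no tails
        show_partition(9);   // 2 vectors + odd1 at offset 8
        show_partition(10);  // 2 vectors + odd2 at offset 8
        show_partition(11);  // 2 vectors + odd2 at 8 + odd1 at 10
        return 0;
    }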
 
diff --git a/src/core/NEON/kernels/arm_gemm/mergeresults.hpp b/src/core/NEON/kernels/arm_gemm/mergeresults.hpp
index 4a6da3d..b1e2ca1 100644
--- a/src/core/NEON/kernels/arm_gemm/mergeresults.hpp
+++ b/src/core/NEON/kernels/arm_gemm/mergeresults.hpp
@@ -30,38 +30,39 @@
 #include "asmlib.hpp"
 #include "utils.hpp"
 
-namespace arm_gemm
-{
-template <unsigned int width, unsigned int height, typename Tin, typename Tout>
-inline void MergeResults(Tout *out, const Tin *in, int ldc, int y0, int ymax, int x0, int xmax, const Tout alpha, const Tout beta)
-{
+namespace arm_gemm {
+
+template<unsigned int width, unsigned int height, typename Tin, typename Tout>
+inline void MergeResults(Tout * out, const Tin * in, int ldc, int y0, int ymax, int x0, int xmax, const Tout alpha, const Tout beta) {
     int full_y_blocks = (ymax - y0) / height;
-    int y_remainder   = (ymax - y0) % height;
-    int y_blocks      = full_y_blocks + (y_remainder ? 1 : 0);
+    int y_remainder = (ymax - y0) % height;
+    int y_blocks = full_y_blocks + (y_remainder ? 1 : 0);
 
     int full_x_blocks = (xmax - x0) / width;
-    int x_remainder   = (xmax - x0) % width;
-    int x_blocks      = full_x_blocks + (x_remainder ? 1 : 0);
+    int x_remainder = (xmax - x0) % width;
+    int x_blocks = full_x_blocks + (x_remainder ? 1 : 0);
 
-    for(int y_block = 0; y_block < y_blocks; y_block++)
-    {
+    for (int y_block = 0; y_block < y_blocks; y_block++) {
         int ybase = y0 + (y_block * height);
 
         int fill_rows = (y_block < full_y_blocks) ? height : y_remainder;
 
-        for(int x_block = 0; x_block < x_blocks; x_block++)
-        {
+        for (int x_block = 0; x_block < x_blocks; x_block++) {
             int xbase = x0 + (x_block * width);
 
             int fill_cols = (x_block < full_x_blocks) ? width : x_remainder;
 
-            for(int row = 0; row < fill_rows; row++)
-            {
-                for(int col = 0; col < fill_cols; col++)
-                {
+            for (int row=0; row < fill_rows; row++) {
+                for (int col=0; col < fill_cols; col++) {
                     Tout &p = out[(ybase + row) * ldc + xbase + col];
 
-                    p = (p * beta) + (alpha * in[row * width + col]);
+                    // Special case for beta==0 - don't read the input;
+                    // (0 * x == 0) is not always true for FP types.
+                    if (beta == static_cast<Tout>(0)) {
+                        p = (alpha * in[row * width + col]);
+                    } else {
+                        p = (p * beta) + (alpha * in[row * width + col]);
+                    }
                 }
             }
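
The new beta==0 branch exists because of the comment above: "(0 * x == 0) is not always true for FP types". If the destination buffer holds uninitialized bytes that happen to decode as Inf or NaN, multiplying them by a zero beta propagates NaN instead of discarding the old contents. A minimal standalone check of that IEEE behaviour (plain C++, not part of the patch):

    #include <cmath>
    #include <cstdio>

    int main() {
        // Stand-ins for garbage already sitting in the output buffer.
        float stale_inf = INFINITY;
        float stale_nan = NAN;

        // With beta == 0.0f, "p * beta" would still poison the result:
        printf("0 * Inf = %f\n", 0.0f * stale_inf);  // prints nan, not 0
        printf("0 * NaN = %f\n", 0.0f * stale_nan);  // prints nan, not 0

        // Hence the special case writes p = alpha * in[...] without reading p.
        return 0;
    }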
 
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp b/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp
index b44e564..2b83393 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp
@@ -27,9 +27,8 @@
 
 #include <arm_neon.h>
 
-template <>
-inline void MergeResults<8, 6>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta)
-{
+template<>
+inline void MergeResults<8, 6>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta) {
     const float *inptr = in;
     prefetch_6x(inptr);
     prefetch_6x(inptr + 96);
@@ -37,8 +36,7 @@
     float32x4_t av = vdupq_n_f32(alpha);
     float32x4_t bv = vdupq_n_f32(beta);
 
-    for(int y = y0; y < ymax; y += 8)
-    {
+    for (int y=y0; y<ymax; y+=8) {
         float *outptr0 = out + (y * ldout) + x0;
         float *outptr1 = outptr0 + ldout;
         float *outptr2 = outptr1 + ldout;
@@ -53,17 +51,14 @@
         prefetch_2x(outptr4);
         prefetch_2x(outptr5);
 
-        for(int i = x0; i < xmax; i += 8)
-        {
+        for (int i=x0; i<xmax; i+=8) {
             float dummyres[8];
 
             /* Make sure we throw away results if Y isn't a multiple of 8.
              * We do this by pointing the result pointer at a dummy buffer
              * we later discard.  */
-            if((y + 5) >= ymax)
-            {
-                switch((y + 5) - ymax)
-                {
+            if ((y+5) >= ymax) {
+                switch ((y + 5) - ymax) {
                     case 4:
                         outptr1 = dummyres;
                     case 3:
@@ -81,84 +76,168 @@
                 }
             }
 
-            /* For ragged X, manually copy over the valid results. */
-            if((i + 7) >= xmax)
-            {
-                for(int xi = 0; xi < 8; xi++)
-                {
-                    if((i + xi) < xmax)
-                    {
-                        *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
-                        outptr0++;
-                        *outptr1 = (alpha * inptr[xi + 8]) + (*outptr1 * beta);
-                        outptr1++;
-                        *outptr2 = (alpha * inptr[xi + 16]) + (*outptr2 * beta);
-                        outptr2++;
-                        *outptr3 = (alpha * inptr[xi + 24]) + (*outptr3 * beta);
-                        outptr3++;
-                        *outptr4 = (alpha * inptr[xi + 32]) + (*outptr4 * beta);
-                        outptr4++;
-                        *outptr5 = (alpha * inptr[xi + 40]) + (*outptr5 * beta);
-                        outptr5++;
+            if (beta == 0.0f) {
+                /* If beta=0, don't read the original input at all. */
+
+                /* For ragged X, manually copy over the valid results. */
+                if ((i+7) >= xmax) {
+                    for (int xi=0; xi<8; xi++) {
+                        if ((i+xi) < xmax) {
+                            *outptr0 = (alpha * inptr[xi]);
+                            outptr0++;
+                            *outptr1 = (alpha * inptr[xi + 8]);
+                            outptr1++;
+                            *outptr2 = (alpha * inptr[xi + 16]);
+                            outptr2++;
+                            *outptr3 = (alpha * inptr[xi + 24]);
+                            outptr3++;
+                            *outptr4 = (alpha * inptr[xi + 32]);
+                            outptr4++;
+                            *outptr5 = (alpha * inptr[xi + 40]);
+                            outptr5++;
+                        }
                     }
+                    inptr += 48;
+                } else {
+                    /* Optimized routine to copy an entire block */
+                    __asm __volatile (
+                        // Rows 0-1
+                        "VLD1.32	{d0-d3},   [%[inptr]]!\n"
+                        "VLD1.32	{d4-d7},   [%[inptr]]!\n"
+
+                        "VMUL.f32	q4, q0, %q[av]\n"
+                        ASM_PREFETCH("[%[inptr], #352]")
+                        "VMUL.f32	q5, q1, %q[av]\n"
+                        "VST1.32	{d8-d11}, [%[outptr0]]!\n"
+                        ASM_PREFETCH("[%[inptr], #416]")
+                        "VMUL.f32	q6, q2, %q[av]\n"
+                        ASM_PREFETCH("[%[inptr], #480]")
+                        "VMUL.f32	q7, q3, %q[av]\n"
+                        "VST1.32	{d12-d15}, [%[outptr1]]!\n"
+
+                        // Rows 2-3
+                        "VLD1.32	{d0-d3},   [%[inptr]]!\n"
+                        "VLD1.32	{d4-d7},   [%[inptr]]!\n"
+
+                        "VMUL.f32	q4, q0, %q[av]\n"
+                        ASM_PREFETCH("[%[outptr0], #96]")
+                        "VMUL.f32	q5, q1, %q[av]\n"
+                        "VST1.32	{d8-d11}, [%[outptr2]]!\n"
+                        ASM_PREFETCH("[%[outptr1], #96]")
+                        "VMUL.f32	q6, q2, %q[av]\n"
+                        ASM_PREFETCH("[%[outptr2], #96]")
+                        "VMUL.f32	q7, q3, %q[av]\n"
+                        "VST1.32	{d12-d15}, [%[outptr3]]!\n"
+
+                        // Rows 4-5
+                        "VLD1.32	{d0-d3},   [%[inptr]]!\n"
+                        "VLD1.32	{d4-d7},   [%[inptr]]!\n"
+
+                        "VMUL.f32	q4, q0, %q[av]\n"
+                        ASM_PREFETCH("[%[outptr3], #96]")
+                        "VMUL.f32	q5, q1, %q[av]\n"
+                        "VST1.32	{d8-d11}, [%[outptr4]]!\n"
+                        ASM_PREFETCH("[%[outptr4], #96]")
+                        "VMUL.f32	q6, q2, %q[av]\n"
+                        ASM_PREFETCH("[%[outptr5], #128]")
+                        "VMUL.f32	q7, q3, %q[av]\n"
+                        "VST1.32	{d12-d15}, [%[outptr5]]!\n"
+                    : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3),
+                      [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [inptr] "+r" (inptr)
+                    : [av] "w" (av), [bv] "w" (bv)
+                    : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"
+                    );
                 }
-                inptr += 48;
-            }
-            else
-            {
-                /* Optimized routine to copy an entire block */
-                __asm __volatile(
-                    // Rows 0-1
-                    "VLD1.32    {d8-d11},  [%[outptr0]]\n"
-                    "VMUL.f32    q4, q4, %q[bv]\n"
-                    "VLD1.32    {d12-d15}, [%[outptr1]]\n"
-                    "VMUL.f32    q5, q5, %q[bv]\n"
-                    "VLD1.32    {d0-d3},   [%[inptr]]!\n"
-                    "VMUL.f32    q6, q6, %q[bv]\n"
-                    "VLD1.32    {d4-d7},   [%[inptr]]!\n"
-                    "VMUL.f32    q7, q7, %q[bv]\n"
+            } else {
+                /* Non-zero beta: Read output and apply beta. */
 
-                    "VMLA.f32    q4, q0, %q[av]\n" ASM_PREFETCH("[%[inptr], #352]")
-                    "VMLA.f32    q5, q1, %q[av]\n"
-                    "VST1.32    {d8-d11}, [%[outptr0]]!\n" ASM_PREFETCH("[%[inptr], #416]") "VMLA.f32    q6, q2, %q[av]\n" ASM_PREFETCH("[%[inptr], #480]")
-                    "VMLA.f32    q7, q3, %q[av]\n"
-                    "VST1.32    {d12-d15}, [%[outptr1]]!\n"
+                /* For ragged X, manually copy over the valid results. */
+                if ((i+7) >= xmax) {
+                    for (int xi=0; xi<8; xi++) {
+                        if ((i+xi) < xmax) {
+                            *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+                            outptr0++;
+                            *outptr1 = (alpha * inptr[xi + 8]) + (*outptr1 * beta);
+                            outptr1++;
+                            *outptr2 = (alpha * inptr[xi + 16]) + (*outptr2 * beta);
+                            outptr2++;
+                            *outptr3 = (alpha * inptr[xi + 24]) + (*outptr3 * beta);
+                            outptr3++;
+                            *outptr4 = (alpha * inptr[xi + 32]) + (*outptr4 * beta);
+                            outptr4++;
+                            *outptr5 = (alpha * inptr[xi + 40]) + (*outptr5 * beta);
+                            outptr5++;
+                        }
+                    }
+                    inptr += 48;
+                } else {
+                    /* Optimized routine to copy an entire block */
+                    __asm __volatile (
+                        // Rows 0-1
+                        "VLD1.32	{d8-d11},  [%[outptr0]]\n"
+                        "VMUL.f32	q4, q4, %q[bv]\n"
+                        "VLD1.32	{d12-d15}, [%[outptr1]]\n"
+                        "VMUL.f32	q5, q5, %q[bv]\n"
+                        "VLD1.32	{d0-d3},   [%[inptr]]!\n"
+                        "VMUL.f32	q6, q6, %q[bv]\n"
+                        "VLD1.32	{d4-d7},   [%[inptr]]!\n"
+                        "VMUL.f32	q7, q7, %q[bv]\n"
 
-                    // Rows 2-3
-                    "VLD1.32    {d8-d11},  [%[outptr2]]\n"
-                    "VMUL.f32    q4, q4, %q[bv]\n"
-                    "VLD1.32    {d12-d15}, [%[outptr3]]\n"
-                    "VMUL.f32    q5, q5, %q[bv]\n"
-                    "VLD1.32    {d0-d3},   [%[inptr]]!\n"
-                    "VMUL.f32    q6, q6, %q[bv]\n"
-                    "VLD1.32    {d4-d7},   [%[inptr]]!\n"
-                    "VMUL.f32    q7, q7, %q[bv]\n"
+                        "VMLA.f32	q4, q0, %q[av]\n"
+                        ASM_PREFETCH("[%[inptr], #352]")
+                        "VMLA.f32	q5, q1, %q[av]\n"
+                        "VST1.32	{d8-d11}, [%[outptr0]]!\n"
+                        ASM_PREFETCH("[%[inptr], #416]")
+                        "VMLA.f32	q6, q2, %q[av]\n"
+                        ASM_PREFETCH("[%[inptr], #480]")
+                        "VMLA.f32	q7, q3, %q[av]\n"
+                        "VST1.32	{d12-d15}, [%[outptr1]]!\n"
 
-                    "VMLA.f32    q4, q0, %q[av]\n" ASM_PREFETCH("[%[outptr0], #96]")
-                    "VMLA.f32    q5, q1, %q[av]\n"
-                    "VST1.32    {d8-d11}, [%[outptr2]]!\n" ASM_PREFETCH("[%[outptr1], #96]") "VMLA.f32    q6, q2, %q[av]\n" ASM_PREFETCH("[%[outptr2], #96]")
-                    "VMLA.f32    q7, q3, %q[av]\n"
-                    "VST1.32    {d12-d15}, [%[outptr3]]!\n"
+                        // Rows 2-3
+                        "VLD1.32	{d8-d11},  [%[outptr2]]\n"
+                        "VMUL.f32	q4, q4, %q[bv]\n"
+                        "VLD1.32	{d12-d15}, [%[outptr3]]\n"
+                        "VMUL.f32	q5, q5, %q[bv]\n"
+                        "VLD1.32	{d0-d3},   [%[inptr]]!\n"
+                        "VMUL.f32	q6, q6, %q[bv]\n"
+                        "VLD1.32	{d4-d7},   [%[inptr]]!\n"
+                        "VMUL.f32	q7, q7, %q[bv]\n"
 
-                    // Rows 4-5
-                    "VLD1.32    {d8-d11},  [%[outptr4]]\n"
-                    "VMUL.f32    q4, q4, %q[bv]\n"
-                    "VLD1.32    {d12-d15}, [%[outptr5]]\n"
-                    "VMUL.f32    q5, q5, %q[bv]\n"
-                    "VLD1.32    {d0-d3},   [%[inptr]]!\n"
-                    "VMUL.f32    q6, q6, %q[bv]\n"
-                    "VLD1.32    {d4-d7},   [%[inptr]]!\n"
-                    "VMUL.f32    q7, q7, %q[bv]\n"
+                        "VMLA.f32	q4, q0, %q[av]\n"
+                        ASM_PREFETCH("[%[outptr0], #96]")
+                        "VMLA.f32	q5, q1, %q[av]\n"
+                        "VST1.32	{d8-d11}, [%[outptr2]]!\n"
+                        ASM_PREFETCH("[%[outptr1], #96]")
+                        "VMLA.f32	q6, q2, %q[av]\n"
+                        ASM_PREFETCH("[%[outptr2], #96]")
+                        "VMLA.f32	q7, q3, %q[av]\n"
+                        "VST1.32	{d12-d15}, [%[outptr3]]!\n"
 
-                    "VMLA.f32    q4, q0, %q[av]\n" ASM_PREFETCH("[%[outptr3], #96]")
-                    "VMLA.f32    q5, q1, %q[av]\n"
-                    "VST1.32    {d8-d11}, [%[outptr4]]!\n" ASM_PREFETCH("[%[outptr4], #96]") "VMLA.f32    q6, q2, %q[av]\n" ASM_PREFETCH("[%[outptr5], #128]")
-                    "VMLA.f32    q7, q3, %q[av]\n"
-                    "VST1.32    {d12-d15}, [%[outptr5]]!\n"
-                    : [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3),
-                    [outptr4] "+r"(outptr4), [outptr5] "+r"(outptr5), [inptr] "+r"(inptr)
-                    : [av] "w"(av), [bv] "w"(bv)
-                    : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
+                        // Rows 4-5
+                        "VLD1.32	{d8-d11},  [%[outptr4]]\n"
+                        "VMUL.f32	q4, q4, %q[bv]\n"
+                        "VLD1.32	{d12-d15}, [%[outptr5]]\n"
+                        "VMUL.f32	q5, q5, %q[bv]\n"
+                        "VLD1.32	{d0-d3},   [%[inptr]]!\n"
+                        "VMUL.f32	q6, q6, %q[bv]\n"
+                        "VLD1.32	{d4-d7},   [%[inptr]]!\n"
+                        "VMUL.f32	q7, q7, %q[bv]\n"
+
+                        "VMLA.f32	q4, q0, %q[av]\n"
+                        ASM_PREFETCH("[%[outptr3], #96]")
+                        "VMLA.f32	q5, q1, %q[av]\n"
+                        "VST1.32	{d8-d11}, [%[outptr4]]!\n"
+                        ASM_PREFETCH("[%[outptr4], #96]")
+                        "VMLA.f32	q6, q2, %q[av]\n"
+                        ASM_PREFETCH("[%[outptr5], #128]")
+                        "VMLA.f32	q7, q3, %q[av]\n"
+                        "VST1.32	{d12-d15}, [%[outptr5]]!\n"
+                    : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3),
+                      [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [inptr] "+r" (inptr)
+                    : [av] "w" (av), [bv] "w" (bv)
+                    : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"
+                    );
+                }
             }
         }
     }
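
Both the beta==0 and non-zero-beta paths above reuse the existing dummyres trick for ragged Y: row pointers that fall past ymax are redirected to a scratch buffer so the block kernel can write all rows unconditionally and the extra rows are simply discarded. A loop-based sketch of the same idea (plain C++, not part of the patch; y and ymax are hypothetical values, and the loop stands in for the fall-through switch on (y + 5) - ymax used in the real code):

    #include <cstdio>

    int main() {
        const int height = 6;          // rows handled per block in the 8x6 merge
        int y = 4, ymax = 7;           // hypothetical block start / matrix height

        float rowbuf[height][8];       // stand-ins for the real output rows
        float dummyres[8];             // writes landing here are thrown away

        // Redirect any row at or beyond ymax to the dummy buffer.
        float *outptr[height];
        for (int r = 0; r < height; r++) {
            outptr[r] = (y + r < ymax) ? rowbuf[r] : dummyres;
        }

        for (int r = 0; r < height; r++) {
            printf("row %d -> %s\n", y + r,
                   outptr[r] == dummyres ? "dummyres (discarded)" : "real output");
        }
        return 0;
    }
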
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_12x8.hpp
index 3b59a43..f6befa2 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_12x8.hpp
@@ -25,9 +25,8 @@
 
 #ifdef __aarch64__
 
-template <>
-inline void MergeResults<12, 8>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta)
-{
+template<>
+inline void MergeResults<12, 8>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta) {
     const float *inptr = in;
     prefetch_6x(inptr);
     prefetch_6x(inptr + 96);
@@ -35,8 +34,7 @@
     float32x4_t av = vdupq_n_f32(alpha);
     float32x4_t bv = vdupq_n_f32(beta);
 
-    for(int y = y0; y < ymax; y += 8)
-    {
+    for (int y=y0; y<ymax; y+=8) {
         float *outptr0 = out + (y * ldout) + x0;
         float *outptr1 = outptr0 + ldout;
         float *outptr2 = outptr1 + ldout;
@@ -55,17 +53,14 @@
         prefetch_2x(outptr6);
         prefetch_2x(outptr7);
 
-        for(int i = x0; i < xmax; i += 12)
-        {
+        for (int i=x0; i<xmax; i+=12) {
             float dummyres[12];
 
             /* Make sure we throw away results if Y isn't a multiple of 8.
              * We do this by pointing the result pointer at a dummy buffer
              * we later discard.  */
-            if((y + 7) >= ymax)
-            {
-                switch((y + 7) - ymax)
-                {
+            if ((y+7) >= ymax) {
+                switch ((y + 7) - ymax) {
                     case 6:
                         outptr1 = dummyres;
                     case 5:
@@ -87,147 +82,259 @@
                 }
             }
 
-            /* For ragged X, manually copy over the valid results. */
-            if((i + 11) >= xmax)
-            {
-                for(int xi = 0; xi < 12; xi++)
-                {
-                    if((i + xi) < xmax)
-                    {
-                        *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
-                        outptr0++;
-                        *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
-                        outptr1++;
-                        *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
-                        outptr2++;
-                        *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
-                        outptr3++;
-                        *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
-                        outptr4++;
-                        *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
-                        outptr5++;
-                        *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta);
-                        outptr6++;
-                        *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta);
-                        outptr7++;
+            if (beta==0.0f) {
+                /* If beta==0, don't read the original input at all. */
+
+                /* For ragged X, manually copy over the valid results. */
+                if ((i+11) >= xmax) {
+                    for (int xi=0; xi<12; xi++) {
+                        if ((i+xi) < xmax) {
+                            *outptr0 = (alpha * inptr[xi]);
+                            outptr0++;
+                            *outptr1 = (alpha * inptr[xi + 12]);
+                            outptr1++;
+                            *outptr2 = (alpha * inptr[xi + 24]);
+                            outptr2++;
+                            *outptr3 = (alpha * inptr[xi + 36]);
+                            outptr3++;
+                            *outptr4 = (alpha * inptr[xi + 48]);
+                            outptr4++;
+                            *outptr5 = (alpha * inptr[xi + 60]);
+                            outptr5++;
+                            *outptr6 = (alpha * inptr[xi + 72]);
+                            outptr6++;
+                            *outptr7 = (alpha * inptr[xi + 84]);
+                            outptr7++;
+                        }
                     }
+                    inptr += 96;
+                } else {
+                    /* Optimized routine to copy an entire block */
+                    __asm __volatile (
+                        // Rows 0-1
+                        "LDP	q0,  q1,  [%[inptr]]\n"
+                        "FMUL	v16.4s, v0.4s, %[av].4s\n"
+                        "LDP	q2,  q3,  [%[inptr], #32]\n"
+                        "FMUL	v17.4s, v1.4s, %[av].4s\n"
+                        "LDP	q4,  q5,  [%[inptr], #64]\n"
+                        "FMUL	v18.4s, v2.4s, %[av].4s\n"
+                        "STP	q16, q17, [%[outptr0]], #32\n"
+                        ASM_PREFETCH("[%[inptr], #768]")
+                        "FMUL	v19.4s, v3.4s, %[av].4s\n"
+                        "STR	q18, [%[outptr0]], #16\n"
+                        "FMUL	v20.4s, v4.4s, %[av].4s\n"
+                        "STP	q19, q20, [%[outptr1]], #32\n"
+                        ASM_PREFETCH("[%[inptr], #832]")
+                        "FMUL	v21.4s, v5.4s, %[av].4s\n"
+                        "STR	q21, [%[outptr1]], #16\n"
+
+                        // Rows 2-3
+                        "LDP	q0,  q1,  [%[inptr], #96]\n"
+                        "FMUL	v16.4s, v0.4s, %[av].4s\n"
+                        "LDP	q2,  q3,  [%[inptr], #128]\n"
+                        "FMUL	v17.4s, v1.4s, %[av].4s\n"
+                        "LDP	q4,  q5,  [%[inptr], #160]\n"
+                        "FMUL	v18.4s, v2.4s, %[av].4s\n"
+                        "STP	q16, q17, [%[outptr2]], #32\n"
+                        ASM_PREFETCH("[%[inptr], #896]")
+                        "FMUL	v19.4s, v3.4s, %[av].4s\n"
+                        "STR	q18, [%[outptr2]], #16\n"
+                        "FMUL	v20.4s, v4.4s, %[av].4s\n"
+                        "STP	q19, q20, [%[outptr3]], #32\n"
+                        ASM_PREFETCH("[%[inptr], #1024]")
+                        "FMUL	v21.4s, v5.4s, %[av].4s\n"
+                        "STR	q21, [%[outptr3]], #16\n"
+
+                        // Rows 4-5
+                        "LDP	q0,  q1,  [%[inptr], #192]\n"
+                        "FMUL	v16.4s, v0.4s, %[av].4s\n"
+                        "LDP	q2,  q3,  [%[inptr], #224]\n"
+                        "FMUL	v17.4s, v1.4s, %[av].4s\n"
+                        "LDP	q4,  q5,  [%[inptr], #256]\n"
+                        "FMUL	v18.4s, v2.4s, %[av].4s\n"
+                        "STP	q16, q17, [%[outptr4]], #32\n"
+                        ASM_PREFETCH("[%[inptr], #960]")
+                        "FMUL	v19.4s, v3.4s, %[av].4s\n"
+                        "STR	q18, [%[outptr4]], #16\n"
+                        "FMUL	v20.4s, v4.4s, %[av].4s\n"
+                        "STP	q19, q20, [%[outptr5]], #32\n"
+                        ASM_PREFETCH("[%[inptr], #1088]")
+                        "FMUL	v21.4s, v5.4s, %[av].4s\n"
+                        "STR	q21, [%[outptr5]], #16\n"
+
+                        // Rows 6-7
+                        "LDP	q0,  q1,  [%[inptr], #288]\n"
+                        "FMUL	v16.4s, v0.4s, %[av].4s\n"
+                        "LDP	q2,  q3,  [%[inptr], #320]\n"
+                        "FMUL	v17.4s, v1.4s, %[av].4s\n"
+                        "LDP	q4,  q5,  [%[inptr], #352]\n"
+                        "FMUL	v18.4s, v2.4s, %[av].4s\n"
+                        "STP	q16, q17, [%[outptr6]], #32\n"
+                        "FMUL	v19.4s, v3.4s, %[av].4s\n"
+                        "STR	q18, [%[outptr6]], #16\n"
+                        "FMUL	v20.4s, v4.4s, %[av].4s\n"
+                        "STP	q19, q20, [%[outptr7]], #32\n"
+                        "FMUL	v21.4s, v5.4s, %[av].4s\n"
+                        "STR	q21, [%[outptr7]], #16\n"
+                        "ADD	%[inptr], %[inptr], #384\n"
+                    : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3),
+                      [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                      [inptr] "+r" (inptr)
+                    : [av] "w" (av), [bv] "w" (bv)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21"
+                    );
                 }
-                inptr += 96;
-            }
-            else
-            {
-                /* Optimized routine to copy an entire block */
-                __asm __volatile(
-                    // Rows 0-1
-                    "LDP    q16, q17, [%[outptr0]]\n"
-                    "FMUL    v16.4s, v16.4s, %[bv].4s\n"
-                    "LDR    q18, [%[outptr0], #32]\n"
-                    "FMUL    v17.4s, v17.4s, %[bv].4s\n"
-                    "LDP    q19, q20, [%[outptr1]]\n"
-                    "FMUL    v18.4s, v18.4s, %[bv].4s\n"
-                    "LDR    q21, [%[outptr1], #32]\n" ASM_PREFETCH("[%[inptr], #768]")
-                    "FMUL    v19.4s, v19.4s, %[bv].4s\n"
-                    "LDP    q0,  q1,  [%[inptr]]\n"
-                    "FMUL    v20.4s, v20.4s, %[bv].4s\n"
-                    "LDP    q2,  q3,  [%[inptr], #32]\n"
-                    "FMUL    v21.4s, v21.4s, %[bv].4s\n"
-                    "LDP    q4,  q5,  [%[inptr], #64]\n"
-                    "FMLA    v16.4s, v0.4s, %[av].4s\n" ASM_PREFETCH("[%[inptr], #832]")
-                    "FMLA    v17.4s, v1.4s, %[av].4s\n"
-                    "STP    q16, q17, [%[outptr0]], #32\n"
-                    "FMLA    v18.4s, v2.4s, %[av].4s\n"
-                    "STR    q18, [%[outptr0]], #16\n"
-                    "FMLA    v19.4s, v3.4s, %[av].4s\n" ASM_PREFETCH("[%[inptr], #896]")
-                    "FMLA    v20.4s, v4.4s, %[av].4s\n"
-                    "STP    q19, q20, [%[outptr1]], #32\n"
-                    "FMLA    v21.4s, v5.4s, %[av].4s\n"
-                    "STR    q21, [%[outptr1]], #16\n"
+            } else {
+                /* For ragged X, manually copy over the valid results. */
+                if ((i+11) >= xmax) {
+                    for (int xi=0; xi<12; xi++) {
+                        if ((i+xi) < xmax) {
+                            *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+                            outptr0++;
+                            *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+                            outptr1++;
+                            *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+                            outptr2++;
+                            *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+                            outptr3++;
+                            *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
+                            outptr4++;
+                            *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
+                            outptr5++;
+                            *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta);
+                            outptr6++;
+                            *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta);
+                            outptr7++;
+                        }
+                    }
+                    inptr += 96;
+                } else {
+                    /* Optimized routine to copy an entire block */
+                    __asm __volatile (
+                        // Rows 0-1
+                        "LDP	q16, q17, [%[outptr0]]\n"
+                        "FMUL	v16.4s, v16.4s, %[bv].4s\n"
+                        "LDR	q18, [%[outptr0], #32]\n"
+                        "FMUL	v17.4s, v17.4s, %[bv].4s\n"
+                        "LDP	q19, q20, [%[outptr1]]\n"
+                        "FMUL	v18.4s, v18.4s, %[bv].4s\n"
+                        "LDR	q21, [%[outptr1], #32]\n"
+                        ASM_PREFETCH("[%[inptr], #768]")
+                        "FMUL	v19.4s, v19.4s, %[bv].4s\n"
+                        "LDP	q0,  q1,  [%[inptr]]\n"
+                        "FMUL	v20.4s, v20.4s, %[bv].4s\n"
+                        "LDP	q2,  q3,  [%[inptr], #32]\n"
+                        "FMUL	v21.4s, v21.4s, %[bv].4s\n"
+                        "LDP	q4,  q5,  [%[inptr], #64]\n"
+                        "FMLA	v16.4s, v0.4s, %[av].4s\n"
+                        ASM_PREFETCH("[%[inptr], #832]")
+                        "FMLA	v17.4s, v1.4s, %[av].4s\n"
+                        "STP	q16, q17, [%[outptr0]], #32\n"
+                        "FMLA	v18.4s, v2.4s, %[av].4s\n"
+                        "STR	q18, [%[outptr0]], #16\n"
+                        "FMLA	v19.4s, v3.4s, %[av].4s\n"
+                        ASM_PREFETCH("[%[inptr], #896]")
+                        "FMLA	v20.4s, v4.4s, %[av].4s\n"
+                        "STP	q19, q20, [%[outptr1]], #32\n"
+                        "FMLA	v21.4s, v5.4s, %[av].4s\n"
+                        "STR	q21, [%[outptr1]], #16\n"
 
-                    // Rows 2-3
-                    "LDP    q16, q17, [%[outptr2]]\n"
-                    "FMUL    v16.4s, v16.4s, %[bv].4s\n"
-                    "LDR    q18, [%[outptr2], #32]\n"
-                    "FMUL    v17.4s, v17.4s, %[bv].4s\n"
-                    "LDP    q19, q20, [%[outptr3]]\n"
-                    "FMUL    v18.4s, v18.4s, %[bv].4s\n"
-                    "LDR    q21, [%[outptr3], #32]\n" ASM_PREFETCH("[%[inptr], #960]")
-                    "FMUL    v19.4s, v19.4s, %[bv].4s\n"
-                    "LDP    q0,  q1,  [%[inptr], #96]\n"
-                    "FMUL    v20.4s, v20.4s, %[bv].4s\n"
-                    "LDP    q2,  q3,  [%[inptr], #128]\n"
-                    "FMUL    v21.4s, v21.4s, %[bv].4s\n"
-                    "LDP    q4,  q5,  [%[inptr], #160]\n"
-                    "FMLA    v16.4s, v0.4s, %[av].4s\n" ASM_PREFETCH("[%[inptr], #1024]")
-                    "FMLA    v17.4s, v1.4s, %[av].4s\n"
-                    "STP    q16, q17, [%[outptr2]], #32\n"
-                    "FMLA    v18.4s, v2.4s, %[av].4s\n"
-                    "STR    q18, [%[outptr2]], #16\n"
-                    "FMLA    v19.4s, v3.4s, %[av].4s\n" ASM_PREFETCH("[%[inptr], #1088]")
-                    "FMLA    v20.4s, v4.4s, %[av].4s\n"
-                    "STP    q19, q20, [%[outptr3]], #32\n"
-                    "FMLA    v21.4s, v5.4s, %[av].4s\n"
-                    "STR    q21, [%[outptr3]], #16\n"
+                        // Rows 2-3
+                        "LDP	q16, q17, [%[outptr2]]\n"
+                        "FMUL	v16.4s, v16.4s, %[bv].4s\n"
+                        "LDR	q18, [%[outptr2], #32]\n"
+                        "FMUL	v17.4s, v17.4s, %[bv].4s\n"
+                        "LDP	q19, q20, [%[outptr3]]\n"
+                        "FMUL	v18.4s, v18.4s, %[bv].4s\n"
+                        "LDR	q21, [%[outptr3], #32]\n"
+                        ASM_PREFETCH("[%[inptr], #960]")
+                        "FMUL	v19.4s, v19.4s, %[bv].4s\n"
+                        "LDP	q0,  q1,  [%[inptr], #96]\n"
+                        "FMUL	v20.4s, v20.4s, %[bv].4s\n"
+                        "LDP	q2,  q3,  [%[inptr], #128]\n"
+                        "FMUL	v21.4s, v21.4s, %[bv].4s\n"
+                        "LDP	q4,  q5,  [%[inptr], #160]\n"
+                        "FMLA	v16.4s, v0.4s, %[av].4s\n"
+                        ASM_PREFETCH("[%[inptr], #1024]")
+                        "FMLA	v17.4s, v1.4s, %[av].4s\n"
+                        "STP	q16, q17, [%[outptr2]], #32\n"
+                        "FMLA	v18.4s, v2.4s, %[av].4s\n"
+                        "STR	q18, [%[outptr2]], #16\n"
+                        "FMLA	v19.4s, v3.4s, %[av].4s\n"
+                        ASM_PREFETCH("[%[inptr], #1088]")
+                        "FMLA	v20.4s, v4.4s, %[av].4s\n"
+                        "STP	q19, q20, [%[outptr3]], #32\n"
+                        "FMLA	v21.4s, v5.4s, %[av].4s\n"
+                        "STR	q21, [%[outptr3]], #16\n"
 
-                    // Rows 4-5
-                    ASM_PREFETCH("[%[outptr0], #80]")
-                    "LDP    q16, q17, [%[outptr4]]\n"
-                    "FMUL    v16.4s, v16.4s, %[bv].4s\n"
-                    "LDR    q18, [%[outptr4], #32]\n"
-                    "FMUL    v17.4s, v17.4s, %[bv].4s\n"
-                    "LDP    q19, q20, [%[outptr5]]\n"
-                    "FMUL    v18.4s, v18.4s, %[bv].4s\n"
-                    "LDR    q21, [%[outptr5], #32]\n" ASM_PREFETCH("[%[outptr1], #80]")
-                    "FMUL    v19.4s, v19.4s, %[bv].4s\n"
-                    "LDP    q0,  q1,  [%[inptr], #192]\n"
-                    "FMUL    v20.4s, v20.4s, %[bv].4s\n"
-                    "LDP    q2,  q3,  [%[inptr], #224]\n"
-                    "FMUL    v21.4s, v21.4s, %[bv].4s\n"
-                    "LDP    q4,  q5,  [%[inptr], #256]\n"
-                    "FMLA    v16.4s, v0.4s, %[av].4s\n" ASM_PREFETCH("[%[outptr2], #80]")
-                    "FMLA    v17.4s, v1.4s, %[av].4s\n"
-                    "STP    q16, q17, [%[outptr4]], #32\n"
-                    "FMLA    v18.4s, v2.4s, %[av].4s\n"
-                    "STR    q18, [%[outptr4]], #16\n"
-                    "FMLA    v19.4s, v3.4s, %[av].4s\n" ASM_PREFETCH("[%[outptr3], #80]")
-                    "FMLA    v20.4s, v4.4s, %[av].4s\n"
-                    "STP    q19, q20, [%[outptr5]], #32\n"
-                    "FMLA    v21.4s, v5.4s, %[av].4s\n"
-                    "STR    q21, [%[outptr5]], #16\n"
+                        // Rows 4-5
+                        ASM_PREFETCH("[%[outptr0], #80]")
+                        "LDP	q16, q17, [%[outptr4]]\n"
+                        "FMUL	v16.4s, v16.4s, %[bv].4s\n"
+                        "LDR	q18, [%[outptr4], #32]\n"
+                        "FMUL	v17.4s, v17.4s, %[bv].4s\n"
+                        "LDP	q19, q20, [%[outptr5]]\n"
+                        "FMUL	v18.4s, v18.4s, %[bv].4s\n"
+                        "LDR	q21, [%[outptr5], #32]\n"
+                        ASM_PREFETCH("[%[outptr1], #80]")
+                        "FMUL	v19.4s, v19.4s, %[bv].4s\n"
+                        "LDP	q0,  q1,  [%[inptr], #192]\n"
+                        "FMUL	v20.4s, v20.4s, %[bv].4s\n"
+                        "LDP	q2,  q3,  [%[inptr], #224]\n"
+                        "FMUL	v21.4s, v21.4s, %[bv].4s\n"
+                        "LDP	q4,  q5,  [%[inptr], #256]\n"
+                        "FMLA	v16.4s, v0.4s, %[av].4s\n"
+                        ASM_PREFETCH("[%[outptr2], #80]")
+                        "FMLA	v17.4s, v1.4s, %[av].4s\n"
+                        "STP	q16, q17, [%[outptr4]], #32\n"
+                        "FMLA	v18.4s, v2.4s, %[av].4s\n"
+                        "STR	q18, [%[outptr4]], #16\n"
+                        "FMLA	v19.4s, v3.4s, %[av].4s\n"
+                        ASM_PREFETCH("[%[outptr3], #80]")
+                        "FMLA	v20.4s, v4.4s, %[av].4s\n"
+                        "STP	q19, q20, [%[outptr5]], #32\n"
+                        "FMLA	v21.4s, v5.4s, %[av].4s\n"
+                        "STR	q21, [%[outptr5]], #16\n"
 
-                    // Rows 6-7
-                    ASM_PREFETCH("[%[outptr4], #80]")
-                    "LDP    q16, q17, [%[outptr6]]\n"
-                    "FMUL    v16.4s, v16.4s, %[bv].4s\n"
-                    "LDR    q18, [%[outptr6], #32]\n"
-                    "FMUL    v17.4s, v17.4s, %[bv].4s\n"
-                    "LDP    q19, q20, [%[outptr7]]\n"
-                    "FMUL    v18.4s, v18.4s, %[bv].4s\n"
-                    "LDR    q21, [%[outptr7], #32]\n" ASM_PREFETCH("[%[outptr5], #80]")
-                    "FMUL    v19.4s, v19.4s, %[bv].4s\n"
-                    "LDP    q0,  q1,  [%[inptr], #288]\n"
-                    "FMUL    v20.4s, v20.4s, %[bv].4s\n"
-                    "LDP    q2,  q3,  [%[inptr], #320]\n"
-                    "FMUL    v21.4s, v21.4s, %[bv].4s\n"
-                    "LDP    q4,  q5,  [%[inptr], #352]\n"
-                    "FMLA    v16.4s, v0.4s, %[av].4s\n" ASM_PREFETCH("[%[outptr6], #128]")
-                    "FMLA    v17.4s, v1.4s, %[av].4s\n"
-                    "STP    q16, q17, [%[outptr6]], #32\n"
-                    "FMLA    v18.4s, v2.4s, %[av].4s\n"
-                    "STR    q18, [%[outptr6]], #16\n"
-                    "FMLA    v19.4s, v3.4s, %[av].4s\n" ASM_PREFETCH("[%[outptr7], #128]")
-                    "FMLA    v20.4s, v4.4s, %[av].4s\n"
-                    "STP    q19, q20, [%[outptr7]], #32\n"
-                    "FMLA    v21.4s, v5.4s, %[av].4s\n"
-                    "STR    q21, [%[outptr7]], #16\n"
-                    "ADD    %[inptr], %[inptr], #384\n"
-                    : [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3),
-                    [outptr4] "+r"(outptr4), [outptr5] "+r"(outptr5), [outptr6] "+r"(outptr6), [outptr7] "+r"(outptr7),
-                    [inptr] "+r"(inptr)
-                    : [av] "w"(av), [bv] "w"(bv)
-                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21");
+                        // Rows 6-7
+                        ASM_PREFETCH("[%[outptr4], #80]")
+                        "LDP	q16, q17, [%[outptr6]]\n"
+                        "FMUL	v16.4s, v16.4s, %[bv].4s\n"
+                        "LDR	q18, [%[outptr6], #32]\n"
+                        "FMUL	v17.4s, v17.4s, %[bv].4s\n"
+                        "LDP	q19, q20, [%[outptr7]]\n"
+                        "FMUL	v18.4s, v18.4s, %[bv].4s\n"
+                        "LDR	q21, [%[outptr7], #32]\n"
+                        ASM_PREFETCH("[%[outptr5], #80]")
+                        "FMUL	v19.4s, v19.4s, %[bv].4s\n"
+                        "LDP	q0,  q1,  [%[inptr], #288]\n"
+                        "FMUL	v20.4s, v20.4s, %[bv].4s\n"
+                        "LDP	q2,  q3,  [%[inptr], #320]\n"
+                        "FMUL	v21.4s, v21.4s, %[bv].4s\n"
+                        "LDP	q4,  q5,  [%[inptr], #352]\n"
+                        "FMLA	v16.4s, v0.4s, %[av].4s\n"
+                        ASM_PREFETCH("[%[outptr6], #128]")
+                        "FMLA	v17.4s, v1.4s, %[av].4s\n"
+                        "STP	q16, q17, [%[outptr6]], #32\n"
+                        "FMLA	v18.4s, v2.4s, %[av].4s\n"
+                        "STR	q18, [%[outptr6]], #16\n"
+                        "FMLA	v19.4s, v3.4s, %[av].4s\n"
+                        ASM_PREFETCH("[%[outptr7], #128]")
+                        "FMLA	v20.4s, v4.4s, %[av].4s\n"
+                        "STP	q19, q20, [%[outptr7]], #32\n"
+                        "FMLA	v21.4s, v5.4s, %[av].4s\n"
+                        "STR	q21, [%[outptr7]], #16\n"
+                        "ADD	%[inptr], %[inptr], #384\n"
+                    : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3),
+                      [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                      [inptr] "+r" (inptr)
+                    : [av] "w" (av), [bv] "w" (bv)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21"
+                    );
+                }
             }
         }
     }
 }
 
-#endif // __aarch64__
\ No newline at end of file
+#endif // __aarch64__
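
For reference, the assembly block above is the vectorised core of the 12x8 float merge: each output element becomes alpha * accumulator + beta * previous output, exactly as in the scalar ragged-edge loop earlier in the file. A minimal C++ NEON-intrinsics sketch of one 12-wide row follows; it is illustrative only, and the helper name merge_row_f32 is hypothetical rather than part of this patch.

#include <arm_neon.h>

// Sketch of the merge performed above for a single 12-wide output row:
// out[i] = alpha * in[i] + beta * out[i].
static inline void merge_row_f32(float *outptr, const float *inptr,
                                 float alpha, float beta) {
    const float32x4_t av = vdupq_n_f32(alpha);
    const float32x4_t bv = vdupq_n_f32(beta);
    for (int i = 0; i < 12; i += 4) {
        float32x4_t acc = vmulq_f32(vld1q_f32(outptr + i), bv); // beta * out
        acc = vfmaq_f32(acc, vld1q_f32(inptr + i), av);         // + alpha * in
        vst1q_f32(outptr + i, acc);
    }
}

The full kernel unrolls this across eight row pointers (outptr0..outptr7) and software-prefetches the upcoming input and output cache lines.
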
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_to_half_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_to_half_12x8.hpp
index 9708fe1..e7a7521 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_to_half_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_to_half_12x8.hpp
@@ -28,9 +28,8 @@
 
 #include <arm_neon.h>
 
-template <>
-inline void MergeResults<12, 8>(__fp16 *out, const float *in, int ldout, int y0, int ymax, int x0, int xmax, const __fp16 alpha, const __fp16 beta)
-{
+template<>
+inline void MergeResults<12,8>(__fp16 *out, const float *in, int ldout, int y0, int ymax, int x0, int xmax, const __fp16 alpha, const __fp16 beta) {
     const float *inptr = in;
     prefetch_6x(inptr);
     prefetch_6x(inptr + 24);
@@ -38,8 +37,7 @@
     float32x4_t av = vdupq_n_f32(alpha);
     float32x4_t bv = vdupq_n_f32(beta);
 
-    for(int y = y0; y < ymax; y += 8)
-    {
+    for (int y=y0; y<ymax; y+=8) {
         __fp16 *outptr0 = out + (y * ldout) + x0;
         __fp16 *outptr1 = outptr0 + ldout;
         __fp16 *outptr2 = outptr1 + ldout;
@@ -58,17 +56,14 @@
         prefetch_2x(outptr6);
         prefetch_2x(outptr7);
 
-        for(int i = x0; i < xmax; i += 12)
-        {
+        for (int i=x0; i<xmax; i+=12) {
             __fp16 dummyres[12];
 
             /* Make sure we throw away results if Y isn't a multiple of 8.
              * We do this by pointing the result pointer at a dummy buffer
              * we later discard.  */
-            if((y + 7) >= ymax)
-            {
-                switch((y + 7) - ymax)
-                {
+            if ((y+7) >= ymax) {
+                switch ((y + 7) - ymax) {
                     case 6:
                         outptr1 = dummyres;
                     case 5:
@@ -90,182 +85,335 @@
                 }
             }
 
-            /* For ragged X, manually copy over the valid results. */
-            if((i + 11) >= xmax)
-            {
-                for(int xi = 0; xi < 12; xi++)
-                {
-                    if((i + xi) < xmax)
-                    {
-                        *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
-                        outptr0++;
-                        *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
-                        outptr1++;
-                        *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
-                        outptr2++;
-                        *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
-                        outptr3++;
-                        *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
-                        outptr4++;
-                        *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
-                        outptr5++;
-                        *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta);
-                        outptr6++;
-                        *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta);
-                        outptr7++;
+            if (beta == ((__fp16)0.0f)) {
+                /* If beta==0, don't read the output. */
+                /* For ragged X, manually copy over the valid results. */
+                if ((i+11) >= xmax) {
+                    for (int xi=0; xi<12; xi++) {
+                        if ((i+xi) < xmax) {
+                            *outptr0 = (alpha * inptr[xi]);
+                            outptr0++;
+                            *outptr1 = (alpha * inptr[xi + 12]);
+                            outptr1++;
+                            *outptr2 = (alpha * inptr[xi + 24]);
+                            outptr2++;
+                            *outptr3 = (alpha * inptr[xi + 36]);
+                            outptr3++;
+                            *outptr4 = (alpha * inptr[xi + 48]);
+                            outptr4++;
+                            *outptr5 = (alpha * inptr[xi + 60]);
+                            outptr5++;
+                            *outptr6 = (alpha * inptr[xi + 72]);
+                            outptr6++;
+                            *outptr7 = (alpha * inptr[xi + 84]);
+                            outptr7++;
+                        }
                     }
+                    inptr += 96;
+                } else {
+                    /* Optimized routine to copy an entire block */
+                    __asm __volatile (
+                        // Rows 0-1
+                        "LDP	q0,  q1,  [%[inptr]]\n"
+                        "LDP	q2,  q3,  [%[inptr], #32]\n"
+                        "LDP	q4,  q5,  [%[inptr], #64]\n"
+                        "FMUL	v16.4s, v0.4s, %[av].4s\n"
+                        ASM_PREFETCH("[%[inptr], #768]")
+                        "FMUL	v17.4s, v1.4s, %[av].4s\n"
+                        ASM_PREFETCH("[%[inptr], #832]")
+                        "FCVTN	v16.4h, v16.4s\n"
+                        ASM_PREFETCH("[%[inptr], #896]")
+                        "FCVTN2	v16.8h, v17.4s\n"
+                        ASM_PREFETCH("[%[inptr], #960]")
+                        "FMUL	v18.4s, v2.4s, %[av].4s\n"
+                        "STR	q16, [%[outptr0]], #16\n"
+                        "FCVTN	v18.4h, v18.4s\n"
+                        "STR	d18, [%[outptr0]], #8\n"
+                        "FMUL	v19.4s, v3.4s, %[av].4s\n"
+                        "FMUL	v20.4s, v4.4s, %[av].4s\n"
+                        "FCVTN	v19.4h, v19.4s\n"
+                        "FCVTN2	v19.8h, v20.4s\n"
+                        "STR	q19, [%[outptr1]], #16\n"
+                        "FMUL	v21.4s, v5.4s, %[av].4s\n"
+                        "FCVTN	v21.4h, v21.4s\n"
+                        "STR	d21, [%[outptr1]], #8\n"
+
+                        // Rows 2-3
+                        "LDP	q0,  q1,  [%[inptr], #96]\n"
+                        "LDP	q2,  q3,  [%[inptr], #128]\n"
+                        "LDP	q4,  q5,  [%[inptr], #160]\n"
+                        "FMUL	v16.4s, v0.4s, %[av].4s\n"
+                        ASM_PREFETCH("[%[inptr], #1024]")
+                        "FMUL	v17.4s, v1.4s, %[av].4s\n"
+                        ASM_PREFETCH("[%[inptr], #1088]")
+                        "FCVTN	v16.4h, v16.4s\n"
+                        ASM_PREFETCH("[%[outptr0], #64]")
+                        "FCVTN2	v16.8h, v17.4s\n"
+                        ASM_PREFETCH("[%[outptr1], #64]")
+                        "FMUL	v18.4s, v2.4s, %[av].4s\n"
+                        "STR	q16, [%[outptr2]], #16\n"
+                        "FCVTN	v18.4h, v18.4s\n"
+                        "STR	d18, [%[outptr2]], #8\n"
+                        "FMUL	v19.4s, v3.4s, %[av].4s\n"
+                        "FMUL	v20.4s, v4.4s, %[av].4s\n"
+                        "FCVTN	v19.4h, v19.4s\n"
+                        "FCVTN2	v19.8h, v20.4s\n"
+                        "STR	q19, [%[outptr3]], #16\n"
+                        "FMUL	v21.4s, v5.4s, %[av].4s\n"
+                        "FCVTN	v21.4h, v21.4s\n"
+                        "STR	d21, [%[outptr3]], #8\n"
+
+                        // Rows 4-5
+                        "LDP	q0,  q1,  [%[inptr], #192]\n"
+                        "LDP	q2,  q3,  [%[inptr], #224]\n"
+                        "LDP	q4,  q5,  [%[inptr], #256]\n"
+                        "FMUL	v16.4s, v0.4s, %[av].4s\n"
+                        "FMUL	v17.4s, v1.4s, %[av].4s\n"
+                        ASM_PREFETCH("[%[outptr2], #64]")
+                        "FCVTN	v16.4h, v16.4s\n"
+                        ASM_PREFETCH("[%[outptr3], #64]")
+                        "FCVTN2	v16.8h, v17.4s\n"
+                        ASM_PREFETCH("[%[outptr4], #88]")
+                        "FMUL	v18.4s, v2.4s, %[av].4s\n"
+                        "STR	q16, [%[outptr4]], #16\n"
+                        "FCVTN	v18.4h, v18.4s\n"
+                        "STR	d18, [%[outptr4]], #8\n"
+                        "FMUL	v19.4s, v3.4s, %[av].4s\n"
+                        "FMUL	v20.4s, v4.4s, %[av].4s\n"
+                        "FCVTN	v19.4h, v19.4s\n"
+                        "FCVTN2	v19.8h, v20.4s\n"
+                        "STR	q19, [%[outptr5]], #16\n"
+                        "FMUL	v21.4s, v5.4s, %[av].4s\n"
+                        "FCVTN	v21.4h, v21.4s\n"
+                        "STR	d21, [%[outptr5]], #8\n"
+
+                        // Rows 6-7
+                        "LDP	q0,  q1,  [%[inptr], #288]\n"
+                        "LDP	q2,  q3,  [%[inptr], #320]\n"
+                        "LDP	q4,  q5,  [%[inptr], #352]\n"
+                        "FMUL	v16.4s, v0.4s, %[av].4s\n"
+                        "FMUL	v17.4s, v1.4s, %[av].4s\n"
+                        ASM_PREFETCH("[%[outptr5], #64]")
+                        "FCVTN	v16.4h, v16.4s\n"
+                        ASM_PREFETCH("[%[outptr6], #88]")
+                        "FCVTN2	v16.8h, v17.4s\n"
+                        ASM_PREFETCH("[%[outptr7], #88]")
+                        "FMUL	v18.4s, v2.4s, %[av].4s\n"
+                        "STR	q16, [%[outptr6]], #16\n"
+                        "FCVTN	v18.4h, v18.4s\n"
+                        "STR	d18, [%[outptr6]], #8\n"
+                        "FMUL	v19.4s, v3.4s, %[av].4s\n"
+                        "FMUL	v20.4s, v4.4s, %[av].4s\n"
+                        "FCVTN	v19.4h, v19.4s\n"
+                        "FCVTN2	v19.8h, v20.4s\n"
+                        "STR	q19, [%[outptr7]], #16\n"
+                        "FMUL	v21.4s, v5.4s, %[av].4s\n"
+                        "FCVTN	v21.4h, v21.4s\n"
+                        "STR	d21, [%[outptr7]], #8\n"
+                        "ADD	%[inptr], %[inptr], #384\n"
+                    : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3),
+                      [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                      [inptr] "+r" (inptr)
+                    : [av] "w" (av), [bv] "w" (bv)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21"
+                    );
                 }
-                inptr += 96;
-            }
-            else
-            {
-                /* Optimized routine to copy an entire block */
-                __asm __volatile(
-                    // Rows 0-1
-                    "LDR    q16, [%[outptr0]]\n"
-                    "FCVTL2    v17.4s, v16.8h\n"
-                    "LDR    d18, [%[outptr0], #16]\n"
-                    "FCVTL    v16.4s, v16.4h\n"
-                    "LDR    q19, [%[outptr1]]\n"
-                    "FMUL    v17.4s, v17.4s, %[bv].4s\n"
-                    "LDR    d21, [%[outptr1], #16]\n"
-                    "FMUL    v16.4s, v16.4s, %[bv].4s\n"
-                    "LDP    q0,  q1,  [%[inptr]]\n"
-                    "FCVTL    v18.4s, v18.4h\n"
-                    "LDP    q2,  q3,  [%[inptr], #32]\n"
-                    "FCVTL2    v20.4s, v19.8h\n"
-                    "LDP    q4,  q5,  [%[inptr], #64]\n"
-                    "FCVTL    v19.4s, v19.4h\n" ASM_PREFETCH("[%[inptr], #768]") "FCVTL    v21.4s, v21.4h\n" ASM_PREFETCH("[%[inptr], #832]") "FMUL    v18.4s, v18.4s, %[bv].4s\n" ASM_PREFETCH("[%[inptr], #896]")
-                    "FMUL    v20.4s, v20.4s, %[bv].4s\n" ASM_PREFETCH("[%[inptr], #960]")
-                    "FMUL    v19.4s, v19.4s, %[bv].4s\n"
-                    "FMUL    v21.4s, v21.4s, %[bv].4s\n"
-                    "FMLA    v16.4s, v0.4s, %[av].4s\n"
-                    "FMLA    v17.4s, v1.4s, %[av].4s\n"
-                    "FCVTN    v16.4h, v16.4s\n"
-                    "FCVTN2    v16.8h, v17.4s\n"
-                    "FMLA    v18.4s, v2.4s, %[av].4s\n"
-                    "STR    q16, [%[outptr0]], #16\n"
-                    "FCVTN    v18.4h, v18.4s\n"
-                    "STR    d18, [%[outptr0]], #8\n"
-                    "FMLA    v19.4s, v3.4s, %[av].4s\n"
-                    "FMLA    v20.4s, v4.4s, %[av].4s\n"
-                    "FCVTN    v19.4h, v19.4s\n"
-                    "FCVTN2    v19.8h, v20.4s\n"
-                    "STR    q19, [%[outptr1]], #16\n"
-                    "FMLA    v21.4s, v5.4s, %[av].4s\n"
-                    "FCVTN    v21.4h, v21.4s\n"
-                    "STR    d21, [%[outptr1]], #8\n"
+            } else {
+                /* For ragged X, manually copy over the valid results. */
+                if ((i+11) >= xmax) {
+                    for (int xi=0; xi<12; xi++) {
+                        if ((i+xi) < xmax) {
+                            *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+                            outptr0++;
+                            *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+                            outptr1++;
+                            *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+                            outptr2++;
+                            *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+                            outptr3++;
+                            *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
+                            outptr4++;
+                            *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
+                            outptr5++;
+                            *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta);
+                            outptr6++;
+                            *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta);
+                            outptr7++;
+                        }
+                    }
+                    inptr += 96;
+                } else {
+                    /* Optimized routine to copy an entire block */
+                    __asm __volatile (
+                        // Rows 0-1
+                        "LDR	q16, [%[outptr0]]\n"
+                        "FCVTL2	v17.4s, v16.8h\n"
+                        "LDR	d18, [%[outptr0], #16]\n"
+                        "FCVTL	v16.4s, v16.4h\n"
+                        "LDR	q19, [%[outptr1]]\n"
+                        "FMUL	v17.4s, v17.4s, %[bv].4s\n"
+                        "LDR	d21, [%[outptr1], #16]\n"
+                        "FMUL	v16.4s, v16.4s, %[bv].4s\n"
+                        "LDP	q0,  q1,  [%[inptr]]\n"
+                        "FCVTL	v18.4s, v18.4h\n"
+                        "LDP	q2,  q3,  [%[inptr], #32]\n"
+                        "FCVTL2	v20.4s, v19.8h\n"
+                        "LDP	q4,  q5,  [%[inptr], #64]\n"
+                        "FCVTL	v19.4s, v19.4h\n"
+                        ASM_PREFETCH("[%[inptr], #768]")
+                        "FCVTL	v21.4s, v21.4h\n"
+                        ASM_PREFETCH("[%[inptr], #832]")
+                        "FMUL	v18.4s, v18.4s, %[bv].4s\n"
+                        ASM_PREFETCH("[%[inptr], #896]")
+                        "FMUL	v20.4s, v20.4s, %[bv].4s\n"
+                        ASM_PREFETCH("[%[inptr], #960]")
+                        "FMUL	v19.4s, v19.4s, %[bv].4s\n"
+                        "FMUL	v21.4s, v21.4s, %[bv].4s\n"
+                        "FMLA	v16.4s, v0.4s, %[av].4s\n"
+                        "FMLA	v17.4s, v1.4s, %[av].4s\n"
+                        "FCVTN	v16.4h, v16.4s\n"
+                        "FCVTN2	v16.8h, v17.4s\n"
+                        "FMLA	v18.4s, v2.4s, %[av].4s\n"
+                        "STR	q16, [%[outptr0]], #16\n"
+                        "FCVTN	v18.4h, v18.4s\n"
+                        "STR	d18, [%[outptr0]], #8\n"
+                        "FMLA	v19.4s, v3.4s, %[av].4s\n"
+                        "FMLA	v20.4s, v4.4s, %[av].4s\n"
+                        "FCVTN	v19.4h, v19.4s\n"
+                        "FCVTN2	v19.8h, v20.4s\n"
+                        "STR	q19, [%[outptr1]], #16\n"
+                        "FMLA	v21.4s, v5.4s, %[av].4s\n"
+                        "FCVTN	v21.4h, v21.4s\n"
+                        "STR	d21, [%[outptr1]], #8\n"
 
-                    // Rows 2-3
-                    "LDR    q16, [%[outptr2]]\n"
-                    "FCVTL2    v17.4s, v16.8h\n"
-                    "LDR    d18, [%[outptr2], #16]\n"
-                    "FCVTL    v16.4s, v16.4h\n"
-                    "LDR    q19, [%[outptr3]]\n"
-                    "FMUL    v17.4s, v17.4s, %[bv].4s\n"
-                    "LDR    d21, [%[outptr3], #16]\n"
-                    "FMUL    v16.4s, v16.4s, %[bv].4s\n"
-                    "LDP    q0,  q1,  [%[inptr], #96]\n"
-                    "FCVTL    v18.4s, v18.4h\n"
-                    "LDP    q2,  q3,  [%[inptr], #128]\n"
-                    "FCVTL2    v20.4s, v19.8h\n"
-                    "LDP    q4,  q5,  [%[inptr], #160]\n"
-                    "FCVTL    v19.4s, v19.4h\n" ASM_PREFETCH("[%[inptr], #1024]") "FCVTL    v21.4s, v21.4h\n" ASM_PREFETCH("[%[inptr], #1088]") "FMUL    v18.4s, v18.4s, %[bv].4s\n" ASM_PREFETCH("[%[outptr0], #64]")
-                    "FMUL    v20.4s, v20.4s, %[bv].4s\n" ASM_PREFETCH("[%[outptr1], #64]")
-                    "FMUL    v19.4s, v19.4s, %[bv].4s\n"
-                    "FMUL    v21.4s, v21.4s, %[bv].4s\n"
-                    "FMLA    v16.4s, v0.4s, %[av].4s\n"
-                    "FMLA    v17.4s, v1.4s, %[av].4s\n"
-                    "FCVTN    v16.4h, v16.4s\n"
-                    "FCVTN2    v16.8h, v17.4s\n"
-                    "FMLA    v18.4s, v2.4s, %[av].4s\n"
-                    "STR    q16, [%[outptr2]], #16\n"
-                    "FCVTN    v18.4h, v18.4s\n"
-                    "STR    d18, [%[outptr2]], #8\n"
-                    "FMLA    v19.4s, v3.4s, %[av].4s\n"
-                    "FMLA    v20.4s, v4.4s, %[av].4s\n"
-                    "FCVTN    v19.4h, v19.4s\n"
-                    "FCVTN2    v19.8h, v20.4s\n"
-                    "STR    q19, [%[outptr3]], #16\n"
-                    "FMLA    v21.4s, v5.4s, %[av].4s\n"
-                    "FCVTN    v21.4h, v21.4s\n"
-                    "STR    d21, [%[outptr3]], #8\n"
+                        // Rows 2-3
+                        "LDR	q16, [%[outptr2]]\n"
+                        "FCVTL2	v17.4s, v16.8h\n"
+                        "LDR	d18, [%[outptr2], #16]\n"
+                        "FCVTL	v16.4s, v16.4h\n"
+                        "LDR	q19, [%[outptr3]]\n"
+                        "FMUL	v17.4s, v17.4s, %[bv].4s\n"
+                        "LDR	d21, [%[outptr3], #16]\n"
+                        "FMUL	v16.4s, v16.4s, %[bv].4s\n"
+                        "LDP	q0,  q1,  [%[inptr], #96]\n"
+                        "FCVTL	v18.4s, v18.4h\n"
+                        "LDP	q2,  q3,  [%[inptr], #128]\n"
+                        "FCVTL2	v20.4s, v19.8h\n"
+                        "LDP	q4,  q5,  [%[inptr], #160]\n"
+                        "FCVTL	v19.4s, v19.4h\n"
+                        ASM_PREFETCH("[%[inptr], #1024]")
+                        "FCVTL	v21.4s, v21.4h\n"
+                        ASM_PREFETCH("[%[inptr], #1088]")
+                        "FMUL	v18.4s, v18.4s, %[bv].4s\n"
+                        ASM_PREFETCH("[%[outptr0], #64]")
+                        "FMUL	v20.4s, v20.4s, %[bv].4s\n"
+                        ASM_PREFETCH("[%[outptr1], #64]")
+                        "FMUL	v19.4s, v19.4s, %[bv].4s\n"
+                        "FMUL	v21.4s, v21.4s, %[bv].4s\n"
+                        "FMLA	v16.4s, v0.4s, %[av].4s\n"
+                        "FMLA	v17.4s, v1.4s, %[av].4s\n"
+                        "FCVTN	v16.4h, v16.4s\n"
+                        "FCVTN2	v16.8h, v17.4s\n"
+                        "FMLA	v18.4s, v2.4s, %[av].4s\n"
+                        "STR	q16, [%[outptr2]], #16\n"
+                        "FCVTN	v18.4h, v18.4s\n"
+                        "STR	d18, [%[outptr2]], #8\n"
+                        "FMLA	v19.4s, v3.4s, %[av].4s\n"
+                        "FMLA	v20.4s, v4.4s, %[av].4s\n"
+                        "FCVTN	v19.4h, v19.4s\n"
+                        "FCVTN2	v19.8h, v20.4s\n"
+                        "STR	q19, [%[outptr3]], #16\n"
+                        "FMLA	v21.4s, v5.4s, %[av].4s\n"
+                        "FCVTN	v21.4h, v21.4s\n"
+                        "STR	d21, [%[outptr3]], #8\n"
 
-                    // Rows 4-5
-                    "LDR    q16, [%[outptr4]]\n"
-                    "FCVTL2    v17.4s, v16.8h\n"
-                    "LDR    d18, [%[outptr4], #16]\n"
-                    "FCVTL    v16.4s, v16.4h\n"
-                    "LDR    q19, [%[outptr5]]\n"
-                    "FMUL    v17.4s, v17.4s, %[bv].4s\n"
-                    "LDR    d21, [%[outptr5], #16]\n"
-                    "FMUL    v16.4s, v16.4s, %[bv].4s\n"
-                    "LDP    q0,  q1,  [%[inptr], #192]\n"
-                    "FCVTL    v18.4s, v18.4h\n"
-                    "LDP    q2,  q3,  [%[inptr], #224]\n"
-                    "FCVTL2    v20.4s, v19.8h\n"
-                    "LDP    q4,  q5,  [%[inptr], #256]\n"
-                    "FCVTL    v19.4s, v19.4h\n" ASM_PREFETCH("[%[outptr2], #64]") "FCVTL    v21.4s, v21.4h\n" ASM_PREFETCH("[%[outptr3], #64]") "FMUL    v18.4s, v18.4s, %[bv].4s\n" ASM_PREFETCH("[%[outptr4], #88]")
-                    "FMUL    v20.4s, v20.4s, %[bv].4s\n"
-                    "FMUL    v19.4s, v19.4s, %[bv].4s\n"
-                    "FMUL    v21.4s, v21.4s, %[bv].4s\n"
-                    "FMLA    v16.4s, v0.4s, %[av].4s\n"
-                    "FMLA    v17.4s, v1.4s, %[av].4s\n"
-                    "FCVTN    v16.4h, v16.4s\n"
-                    "FCVTN2    v16.8h, v17.4s\n"
-                    "FMLA    v18.4s, v2.4s, %[av].4s\n"
-                    "STR    q16, [%[outptr4]], #16\n"
-                    "FCVTN    v18.4h, v18.4s\n"
-                    "STR    d18, [%[outptr4]], #8\n"
-                    "FMLA    v19.4s, v3.4s, %[av].4s\n"
-                    "FMLA    v20.4s, v4.4s, %[av].4s\n"
-                    "FCVTN    v19.4h, v19.4s\n"
-                    "FCVTN2    v19.8h, v20.4s\n"
-                    "STR    q19, [%[outptr5]], #16\n"
-                    "FMLA    v21.4s, v5.4s, %[av].4s\n"
-                    "FCVTN    v21.4h, v21.4s\n"
-                    "STR    d21, [%[outptr5]], #8\n"
+                        // Rows 4-5
+                        "LDR	q16, [%[outptr4]]\n"
+                        "FCVTL2	v17.4s, v16.8h\n"
+                        "LDR	d18, [%[outptr4], #16]\n"
+                        "FCVTL	v16.4s, v16.4h\n"
+                        "LDR	q19, [%[outptr5]]\n"
+                        "FMUL	v17.4s, v17.4s, %[bv].4s\n"
+                        "LDR	d21, [%[outptr5], #16]\n"
+                        "FMUL	v16.4s, v16.4s, %[bv].4s\n"
+                        "LDP	q0,  q1,  [%[inptr], #192]\n"
+                        "FCVTL	v18.4s, v18.4h\n"
+                        "LDP	q2,  q3,  [%[inptr], #224]\n"
+                        "FCVTL2	v20.4s, v19.8h\n"
+                        "LDP	q4,  q5,  [%[inptr], #256]\n"
+                        "FCVTL	v19.4s, v19.4h\n"
+                        ASM_PREFETCH("[%[outptr2], #64]")
+                        "FCVTL	v21.4s, v21.4h\n"
+                        ASM_PREFETCH("[%[outptr3], #64]")
+                        "FMUL	v18.4s, v18.4s, %[bv].4s\n"
+                        ASM_PREFETCH("[%[outptr4], #88]")
+                        "FMUL	v20.4s, v20.4s, %[bv].4s\n"
+                        "FMUL	v19.4s, v19.4s, %[bv].4s\n"
+                        "FMUL	v21.4s, v21.4s, %[bv].4s\n"
+                        "FMLA	v16.4s, v0.4s, %[av].4s\n"
+                        "FMLA	v17.4s, v1.4s, %[av].4s\n"
+                        "FCVTN	v16.4h, v16.4s\n"
+                        "FCVTN2	v16.8h, v17.4s\n"
+                        "FMLA	v18.4s, v2.4s, %[av].4s\n"
+                        "STR	q16, [%[outptr4]], #16\n"
+                        "FCVTN	v18.4h, v18.4s\n"
+                        "STR	d18, [%[outptr4]], #8\n"
+                        "FMLA	v19.4s, v3.4s, %[av].4s\n"
+                        "FMLA	v20.4s, v4.4s, %[av].4s\n"
+                        "FCVTN	v19.4h, v19.4s\n"
+                        "FCVTN2	v19.8h, v20.4s\n"
+                        "STR	q19, [%[outptr5]], #16\n"
+                        "FMLA	v21.4s, v5.4s, %[av].4s\n"
+                        "FCVTN	v21.4h, v21.4s\n"
+                        "STR	d21, [%[outptr5]], #8\n"
 
-                    // Rows 6-7
-                    "LDR    q16, [%[outptr6]]\n"
-                    "FCVTL2    v17.4s, v16.8h\n"
-                    "LDR    d18, [%[outptr6], #16]\n"
-                    "FCVTL    v16.4s, v16.4h\n"
-                    "LDR    q19, [%[outptr7]]\n"
-                    "FMUL    v17.4s, v17.4s, %[bv].4s\n"
-                    "LDR    d21, [%[outptr7], #16]\n"
-                    "FMUL    v16.4s, v16.4s, %[bv].4s\n"
-                    "LDP    q0,  q1,  [%[inptr], #288]\n"
-                    "FCVTL    v18.4s, v18.4h\n"
-                    "LDP    q2,  q3,  [%[inptr], #320]\n"
-                    "FCVTL2    v20.4s, v19.8h\n"
-                    "LDP    q4,  q5,  [%[inptr], #352]\n"
-                    "FCVTL    v19.4s, v19.4h\n" ASM_PREFETCH("[%[outptr5], #64]") "FCVTL    v21.4s, v21.4h\n" ASM_PREFETCH("[%[outptr6], #88]") "FMUL    v18.4s, v18.4s, %[bv].4s\n" ASM_PREFETCH("[%[outptr7], #88]")
-                    "FMUL    v20.4s, v20.4s, %[bv].4s\n"
-                    "FMUL    v19.4s, v19.4s, %[bv].4s\n"
-                    "FMUL    v21.4s, v21.4s, %[bv].4s\n"
-                    "FMLA    v16.4s, v0.4s, %[av].4s\n"
-                    "FMLA    v17.4s, v1.4s, %[av].4s\n"
-                    "FCVTN    v16.4h, v16.4s\n"
-                    "FCVTN2    v16.8h, v17.4s\n"
-                    "FMLA    v18.4s, v2.4s, %[av].4s\n"
-                    "STR    q16, [%[outptr6]], #16\n"
-                    "FCVTN    v18.4h, v18.4s\n"
-                    "STR    d18, [%[outptr6]], #8\n"
-                    "FMLA    v19.4s, v3.4s, %[av].4s\n"
-                    "FMLA    v20.4s, v4.4s, %[av].4s\n"
-                    "FCVTN    v19.4h, v19.4s\n"
-                    "FCVTN2    v19.8h, v20.4s\n"
-                    "STR    q19, [%[outptr7]], #16\n"
-                    "FMLA    v21.4s, v5.4s, %[av].4s\n"
-                    "FCVTN    v21.4h, v21.4s\n"
-                    "STR    d21, [%[outptr7]], #8\n"
-                    "ADD    %[inptr], %[inptr], #384\n"
-                    : [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3),
-                    [outptr4] "+r"(outptr4), [outptr5] "+r"(outptr5), [outptr6] "+r"(outptr6), [outptr7] "+r"(outptr7),
-                    [inptr] "+r"(inptr)
-                    : [av] "w"(av), [bv] "w"(bv)
-                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21");
+                        // Rows 6-7
+                        "LDR	q16, [%[outptr6]]\n"
+                        "FCVTL2	v17.4s, v16.8h\n"
+                        "LDR	d18, [%[outptr6], #16]\n"
+                        "FCVTL	v16.4s, v16.4h\n"
+                        "LDR	q19, [%[outptr7]]\n"
+                        "FMUL	v17.4s, v17.4s, %[bv].4s\n"
+                        "LDR	d21, [%[outptr7], #16]\n"
+                        "FMUL	v16.4s, v16.4s, %[bv].4s\n"
+                        "LDP	q0,  q1,  [%[inptr], #288]\n"
+                        "FCVTL	v18.4s, v18.4h\n"
+                        "LDP	q2,  q3,  [%[inptr], #320]\n"
+                        "FCVTL2	v20.4s, v19.8h\n"
+                        "LDP	q4,  q5,  [%[inptr], #352]\n"
+                        "FCVTL	v19.4s, v19.4h\n"
+                        ASM_PREFETCH("[%[outptr5], #64]")
+                        "FCVTL	v21.4s, v21.4h\n"
+                        ASM_PREFETCH("[%[outptr6], #88]")
+                        "FMUL	v18.4s, v18.4s, %[bv].4s\n"
+                        ASM_PREFETCH("[%[outptr7], #88]")
+                        "FMUL	v20.4s, v20.4s, %[bv].4s\n"
+                        "FMUL	v19.4s, v19.4s, %[bv].4s\n"
+                        "FMUL	v21.4s, v21.4s, %[bv].4s\n"
+                        "FMLA	v16.4s, v0.4s, %[av].4s\n"
+                        "FMLA	v17.4s, v1.4s, %[av].4s\n"
+                        "FCVTN	v16.4h, v16.4s\n"
+                        "FCVTN2	v16.8h, v17.4s\n"
+                        "FMLA	v18.4s, v2.4s, %[av].4s\n"
+                        "STR	q16, [%[outptr6]], #16\n"
+                        "FCVTN	v18.4h, v18.4s\n"
+                        "STR	d18, [%[outptr6]], #8\n"
+                        "FMLA	v19.4s, v3.4s, %[av].4s\n"
+                        "FMLA	v20.4s, v4.4s, %[av].4s\n"
+                        "FCVTN	v19.4h, v19.4s\n"
+                        "FCVTN2	v19.8h, v20.4s\n"
+                        "STR	q19, [%[outptr7]], #16\n"
+                        "FMLA	v21.4s, v5.4s, %[av].4s\n"
+                        "FCVTN	v21.4h, v21.4s\n"
+                        "STR	d21, [%[outptr7]], #8\n"
+                        "ADD	%[inptr], %[inptr], #384\n"
+                    : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3),
+                      [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                      [inptr] "+r" (inptr)
+                    : [av] "w" (av), [bv] "w" (bv)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21"
+                    );
+                }
             }
         }
     }
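
The float-to-half merge above follows the same alpha/beta pattern with an extra precision change: when beta is non-zero the existing __fp16 output is loaded and widened to fp32 (FCVTL) before being scaled by beta, and the fp32 result is narrowed back to fp16 (FCVTN/FCVTN2) before the store; when beta == 0 the output buffer is never read at all. Below is a minimal intrinsics sketch of four elements, assuming a hypothetical helper merge4_f32_to_f16 (the kernel's alpha and beta are __fp16 scalars; plain floats are used here for brevity).

#include <arm_neon.h>

static inline void merge4_f32_to_f16(__fp16 *outptr, const float *inptr,
                                     float alpha, float beta) {
    float32x4_t acc = vmulq_f32(vld1q_f32(inptr), vdupq_n_f32(alpha)); // alpha * in
    if (beta != 0.0f) {
        // Widen the existing fp16 output (FCVTL) and accumulate beta * out.
        float32x4_t prev = vcvt_f32_f16(vld1_f16((const float16_t *)outptr));
        acc = vfmaq_f32(acc, prev, vdupq_n_f32(beta));
    }
    vst1_f16((float16_t *)outptr, vcvt_f16_f32(acc));                  // narrow back (FCVTN)
}

Keeping the beta == 0 branch separate is what lets the kernel skip the output loads entirely when the destination has not been written yet.
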
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_half_24x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_half_24x8.hpp
index 08cfc00..3ed43b1 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_half_24x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_half_24x8.hpp
@@ -23,12 +23,12 @@
  */
 #pragma once
 
-#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
+// AArch64 only, and requires either the FP16_KERNELS option to be set or a target that explicitly supports FP16 vectors.
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
 
-template <>
+template<>
 inline void MergeResults<24, 8>(__fp16 *out, const __fp16 *in, const int ldout, const int y0, const int ymax,
-                                const int x0, const int xmax, const __fp16 alpha, const __fp16 beta)
-{
+                         const int x0, const int xmax, const __fp16 alpha, const __fp16 beta) {
     const __fp16 *inptr = in;
     prefetch_6x(inptr);
     prefetch_6x(inptr + 48);
@@ -36,8 +36,7 @@
     float16x8_t va = vdupq_n_f16(alpha);
     float16x8_t vb = vdupq_n_f16(beta);
 
-    for(int y = y0; y < ymax; y += 8)
-    {
+    for (int y=y0; y<ymax; y+=8) {
         __fp16 *outptr0 = out + (y * ldout) + x0;
         __fp16 *outptr1 = outptr0 + ldout;
         __fp16 *outptr2 = outptr1 + ldout;
@@ -56,17 +55,14 @@
         prefetch_2x(outptr6);
         prefetch_2x(outptr7);
 
-        for(int i = x0; i < xmax; i += 24)
-        {
+        for (int i=x0; i<xmax; i+=24) {
             __fp16 dummyres[24];
 
             /* Make sure we throw away results if Y isn't a multiple of 8.
              * We do this by pointing the result pointer at a dummy buffer
              * we later discard.  */
-            if((y + 7) >= ymax)
-            {
-                switch((y + 7) - ymax)
-                {
+            if ((y+7) >= ymax) {
+                switch ((y + 7) - ymax) {
                     case 6:
                         outptr1 = dummyres;
                     case 5:
@@ -85,149 +81,277 @@
 
                     default:
                         UNREACHABLE("Impossible.");
+
                 }
             }
 
-            /* For ragged X, manually copy over the valid results. */
-            if((i + 23) >= xmax)
-            {
-                for(int xi = 0; xi < 24; xi++)
-                {
-                    if((i + xi) < xmax)
-                    {
-                        *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
-                        outptr0++;
-                        *outptr1 = (alpha * inptr[xi + 24]) + (*outptr1 * beta);
-                        outptr1++;
-                        *outptr2 = (alpha * inptr[xi + 48]) + (*outptr2 * beta);
-                        outptr2++;
-                        *outptr3 = (alpha * inptr[xi + 72]) + (*outptr3 * beta);
-                        outptr3++;
-                        *outptr4 = (alpha * inptr[xi + 96]) + (*outptr4 * beta);
-                        outptr4++;
-                        *outptr5 = (alpha * inptr[xi + 120]) + (*outptr5 * beta);
-                        outptr5++;
-                        *outptr6 = (alpha * inptr[xi + 144]) + (*outptr6 * beta);
-                        outptr6++;
-                        *outptr7 = (alpha * inptr[xi + 168]) + (*outptr7 * beta);
-                        outptr7++;
+            if (beta == (__fp16)0.0f) {
+                /* If beta==0, don't read the output. */
+
+                /* For ragged X, manually copy over the valid results. */
+                if ((i+23) >= xmax) {
+                    for (int xi=0; xi<24; xi++) {
+                        if ((i+xi) < xmax) {
+                            *outptr0 = (alpha * inptr[xi]);
+                            outptr0++;
+                            *outptr1 = (alpha * inptr[xi + 24]);
+                            outptr1++;
+                            *outptr2 = (alpha * inptr[xi + 48]);
+                            outptr2++;
+                            *outptr3 = (alpha * inptr[xi + 72]);
+                            outptr3++;
+                            *outptr4 = (alpha * inptr[xi + 96]);
+                            outptr4++;
+                            *outptr5 = (alpha * inptr[xi + 120]);
+                            outptr5++;
+                            *outptr6 = (alpha * inptr[xi + 144]);
+                            outptr6++;
+                            *outptr7 = (alpha * inptr[xi + 168]);
+                            outptr7++;
+                        }
                     }
+                    inptr += 192;
+                } else {
+                    /* Optimized routine to copy an entire block */
+                    __asm __volatile (
+#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                        ".arch	armv8.2-a+fp16\n"
+#endif
+                        // Rows 0-1
+                        ASM_PREFETCH("[%[inptr], #768]")
+                        "LDP	q0,  q1,  [%[inptr]]\n"
+                        "LDP	q2,  q3,  [%[inptr], #32]\n"
+                        "LDP	q4,  q5,  [%[inptr], #64]\n"
+                        "FMUL	v16.8h, v0.8h, %[va].8h\n"
+                        ASM_PREFETCH("[%[inptr], #832]")
+                        "FMUL	v17.8h, v1.8h, %[va].8h\n"
+                        "STP	q16, q17, [%[outptr0]], #32\n"
+                        "FMUL	v18.8h, v2.8h, %[va].8h\n"
+                        "STR	q18, [%[outptr0]], #16\n"
+                        "FMUL	v19.8h, v3.8h, %[va].8h\n"
+                        ASM_PREFETCH("[%[inptr], #896]")
+                        "FMUL	v20.8h, v4.8h, %[va].8h\n"
+                        "STP	q19, q20, [%[outptr1]], #32\n"
+                        "FMUL	v21.8h, v5.8h, %[va].8h\n"
+                        "STR	q21, [%[outptr1]], #16\n"
+                        ASM_PREFETCH("[%[inptr], #960]")
+
+                        // Rows 2-3
+                        ASM_PREFETCH("[%[inptr], #1024]")
+                        "LDP	q0,  q1,  [%[inptr], #96]\n"
+                        "LDP	q2,  q3,  [%[inptr], #128]\n"
+                        "LDP	q4,  q5,  [%[inptr], #160]\n"
+                        "FMUL	v16.8h, v0.8h, %[va].8h\n"
+                        ASM_PREFETCH("[%[inptr], #1088]")
+                        "FMUL	v17.8h, v1.8h, %[va].8h\n"
+                        "STP	q16, q17, [%[outptr2]], #32\n"
+                        "FMUL	v18.8h, v2.8h, %[va].8h\n"
+                        "STR	q18, [%[outptr2]], #16\n"
+                        "FMUL	v19.8h, v3.8h, %[va].8h\n"
+                        ASM_PREFETCH("[%[outptr0], #80]")
+                        "FMUL	v20.8h, v4.8h, %[va].8h\n"
+                        "STP	q19, q20, [%[outptr3]], #32\n"
+                        "FMUL	v21.8h, v5.8h, %[va].8h\n"
+                        "STR	q21, [%[outptr3]], #16\n"
+                        ASM_PREFETCH("[%[outptr1], #80]")
+
+                        // Rows 4-5
+                        ASM_PREFETCH("[%[outptr2], #80]")
+                        "LDP	q0,  q1,  [%[inptr], #192]\n"
+                        "LDP	q2,  q3,  [%[inptr], #224]\n"
+                        "LDP	q4,  q5,  [%[inptr], #256]\n"
+                        "FMUL	v16.8h, v0.8h, %[va].8h\n"
+                        ASM_PREFETCH("[%[outptr3], #80]")
+                        "FMUL	v17.8h, v1.8h, %[va].8h\n"
+                        "STP	q16, q17, [%[outptr4]], #32\n"
+                        "FMUL	v18.8h, v2.8h, %[va].8h\n"
+                        "STR	q18, [%[outptr4]], #16\n"
+                        "FMUL	v19.8h, v3.8h, %[va].8h\n"
+                        ASM_PREFETCH("[%[outptr4], #80]")
+                        "FMUL	v20.8h, v4.8h, %[va].8h\n"
+                        "STP	q19, q20, [%[outptr5]], #32\n"
+                        "FMUL	v21.8h, v5.8h, %[va].8h\n"
+                        "STR	q21, [%[outptr5]], #16\n"
+
+                        // Rows 6-7
+                        ASM_PREFETCH("[%[outptr5], #80]")
+                        "LDP	q0,  q1,  [%[inptr], #288]\n"
+                        "LDP	q2,  q3,  [%[inptr], #320]\n"
+                        "LDP	q4,  q5,  [%[inptr], #352]\n"
+                        "FMUL	v16.8h, v0.8h, %[va].8h\n"
+                        ASM_PREFETCH("[%[outptr6], #128]")
+                        "FMUL	v17.8h, v1.8h, %[va].8h\n"
+                        "STP	q16, q17, [%[outptr6]], #32\n"
+                        "FMUL	v18.8h, v2.8h, %[va].8h\n"
+                        "STR	q18, [%[outptr6]], #16\n"
+                        "FMUL	v19.8h, v3.8h, %[va].8h\n"
+                        ASM_PREFETCH("[%[outptr7], #128]")
+                        "FMUL	v20.8h, v4.8h, %[va].8h\n"
+                        "STP	q19, q20, [%[outptr7]], #32\n"
+                        "FMUL	v21.8h, v5.8h, %[va].8h\n"
+                        "STR	q21, [%[outptr7]], #16\n"
+                        "ADD	%[inptr], %[inptr], #384\n"
+                    : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3),
+                      [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                      [inptr] "+r" (inptr)
+                    : [va] "w" (va), [vb] "w" (vb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21"
+                    );
                 }
-                inptr += 192;
-            }
-            else
-            {
-                /* Optimized routine to copy an entire block */
-                __asm __volatile(
-                    ".arch    armv8.2-a+fp16\n"
-                    // Rows 0-1
-                    "LDP    q16, q17, [%[outptr0]]\n"
-                    "FMUL    v16.8h, v16.8h, %[vb].8h\n"
-                    "LDR    q18, [%[outptr0], #32]\n"
-                    "FMUL    v17.8h, v17.8h, %[vb].8h\n"
-                    "LDP    q19, q20, [%[outptr1]]\n"
-                    "FMUL    v18.8h, v18.8h, %[vb].8h\n" ASM_PREFETCH("[%[inptr], #768]")
-                    "LDR    q21, [%[outptr1], #32]\n"
-                    "FMUL    v19.8h, v19.8h, %[vb].8h\n"
-                    "LDP    q0,  q1,  [%[inptr]]\n"
-                    "FMUL    v20.8h, v20.8h, %[vb].8h\n"
-                    "LDP    q2,  q3,  [%[inptr], #32]\n"
-                    "FMUL    v21.8h, v21.8h, %[vb].8h\n"
-                    "LDP    q4,  q5,  [%[inptr], #64]\n"
-                    "FMLA    v16.8h, v0.8h, %[va].8h\n" ASM_PREFETCH("[%[inptr], #832]")
-                    "FMLA    v17.8h, v1.8h, %[va].8h\n"
-                    "STP    q16, q17, [%[outptr0]], #32\n"
-                    "FMLA    v18.8h, v2.8h, %[va].8h\n"
-                    "STR    q18, [%[outptr0]], #16\n"
-                    "FMLA    v19.8h, v3.8h, %[va].8h\n" ASM_PREFETCH("[%[inptr], #896]")
-                    "FMLA    v20.8h, v4.8h, %[va].8h\n"
-                    "STP    q19, q20, [%[outptr1]], #32\n"
-                    "FMLA    v21.8h, v5.8h, %[va].8h\n"
-                    "STR    q21, [%[outptr1]], #16\n" ASM_PREFETCH("[%[inptr], #960]")
+            } else {
+                /* For ragged X, manually copy over the valid results. */
+                if ((i+23) >= xmax) {
+                    for (int xi=0; xi<24; xi++) {
+                        if ((i+xi) < xmax) {
+                            *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+                            outptr0++;
+                            *outptr1 = (alpha * inptr[xi + 24]) + (*outptr1 * beta);
+                            outptr1++;
+                            *outptr2 = (alpha * inptr[xi + 48]) + (*outptr2 * beta);
+                            outptr2++;
+                            *outptr3 = (alpha * inptr[xi + 72]) + (*outptr3 * beta);
+                            outptr3++;
+                            *outptr4 = (alpha * inptr[xi + 96]) + (*outptr4 * beta);
+                            outptr4++;
+                            *outptr5 = (alpha * inptr[xi + 120]) + (*outptr5 * beta);
+                            outptr5++;
+                            *outptr6 = (alpha * inptr[xi + 144]) + (*outptr6 * beta);
+                            outptr6++;
+                            *outptr7 = (alpha * inptr[xi + 168]) + (*outptr7 * beta);
+                            outptr7++;
+                        }
+                    }
+                    inptr += 192;
+                } else {
+                    /* Optimized routine to copy an entire block */
+                    __asm __volatile (
+#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                        ".arch	armv8.2-a+fp16\n"
+#endif
+                        // Rows 0-1
+                        "LDP	q16, q17, [%[outptr0]]\n"
+                        "FMUL	v16.8h, v16.8h, %[vb].8h\n"
+                        "LDR	q18, [%[outptr0], #32]\n"
+                        "FMUL	v17.8h, v17.8h, %[vb].8h\n"
+                        "LDP	q19, q20, [%[outptr1]]\n"
+                        "FMUL	v18.8h, v18.8h, %[vb].8h\n"
+                        ASM_PREFETCH("[%[inptr], #768]")
+                        "LDR	q21, [%[outptr1], #32]\n"
+                        "FMUL	v19.8h, v19.8h, %[vb].8h\n"
+                        "LDP	q0,  q1,  [%[inptr]]\n"
+                        "FMUL	v20.8h, v20.8h, %[vb].8h\n"
+                        "LDP	q2,  q3,  [%[inptr], #32]\n"
+                        "FMUL	v21.8h, v21.8h, %[vb].8h\n"
+                        "LDP	q4,  q5,  [%[inptr], #64]\n"
+                        "FMLA	v16.8h, v0.8h, %[va].8h\n"
+                        ASM_PREFETCH("[%[inptr], #832]")
+                        "FMLA	v17.8h, v1.8h, %[va].8h\n"
+                        "STP	q16, q17, [%[outptr0]], #32\n"
+                        "FMLA	v18.8h, v2.8h, %[va].8h\n"
+                        "STR	q18, [%[outptr0]], #16\n"
+                        "FMLA	v19.8h, v3.8h, %[va].8h\n"
+                        ASM_PREFETCH("[%[inptr], #896]")
+                        "FMLA	v20.8h, v4.8h, %[va].8h\n"
+                        "STP	q19, q20, [%[outptr1]], #32\n"
+                        "FMLA	v21.8h, v5.8h, %[va].8h\n"
+                        "STR	q21, [%[outptr1]], #16\n"
+                        ASM_PREFETCH("[%[inptr], #960]")
 
-                    // Rows 2-3
-                    "LDP    q16, q17, [%[outptr2]]\n"
-                    "FMUL    v16.8h, v16.8h, %[vb].8h\n"
-                    "LDR    q18, [%[outptr2], #32]\n"
-                    "FMUL    v17.8h, v17.8h, %[vb].8h\n"
-                    "LDP    q19, q20, [%[outptr3]]\n"
-                    "FMUL    v18.8h, v18.8h, %[vb].8h\n" ASM_PREFETCH("[%[inptr], #1024]")
-                    "LDR    q21, [%[outptr3], #32]\n"
-                    "FMUL    v19.8h, v19.8h, %[vb].8h\n"
-                    "LDP    q0,  q1,  [%[inptr], #96]\n"
-                    "FMUL    v20.8h, v20.8h, %[vb].8h\n"
-                    "LDP    q2,  q3,  [%[inptr], #128]\n"
-                    "FMUL    v21.8h, v21.8h, %[vb].8h\n"
-                    "LDP    q4,  q5,  [%[inptr], #160]\n"
-                    "FMLA    v16.8h, v0.8h, %[va].8h\n" ASM_PREFETCH("[%[inptr], #1088]")
-                    "FMLA    v17.8h, v1.8h, %[va].8h\n"
-                    "STP    q16, q17, [%[outptr2]], #32\n"
-                    "FMLA    v18.8h, v2.8h, %[va].8h\n"
-                    "STR    q18, [%[outptr2]], #16\n"
-                    "FMLA    v19.8h, v3.8h, %[va].8h\n" ASM_PREFETCH("[%[outptr0], #80]")
-                    "FMLA    v20.8h, v4.8h, %[va].8h\n"
-                    "STP    q19, q20, [%[outptr3]], #32\n"
-                    "FMLA    v21.8h, v5.8h, %[va].8h\n"
-                    "STR    q21, [%[outptr3]], #16\n" ASM_PREFETCH("[%[outptr1], #80]")
+                        // Rows 2-3
+                        "LDP	q16, q17, [%[outptr2]]\n"
+                        "FMUL	v16.8h, v16.8h, %[vb].8h\n"
+                        "LDR	q18, [%[outptr2], #32]\n"
+                        "FMUL	v17.8h, v17.8h, %[vb].8h\n"
+                        "LDP	q19, q20, [%[outptr3]]\n"
+                        "FMUL	v18.8h, v18.8h, %[vb].8h\n"
+                        ASM_PREFETCH("[%[inptr], #1024]")
+                        "LDR	q21, [%[outptr3], #32]\n"
+                        "FMUL	v19.8h, v19.8h, %[vb].8h\n"
+                        "LDP	q0,  q1,  [%[inptr], #96]\n"
+                        "FMUL	v20.8h, v20.8h, %[vb].8h\n"
+                        "LDP	q2,  q3,  [%[inptr], #128]\n"
+                        "FMUL	v21.8h, v21.8h, %[vb].8h\n"
+                        "LDP	q4,  q5,  [%[inptr], #160]\n"
+                        "FMLA	v16.8h, v0.8h, %[va].8h\n"
+                        ASM_PREFETCH("[%[inptr], #1088]")
+                        "FMLA	v17.8h, v1.8h, %[va].8h\n"
+                        "STP	q16, q17, [%[outptr2]], #32\n"
+                        "FMLA	v18.8h, v2.8h, %[va].8h\n"
+                        "STR	q18, [%[outptr2]], #16\n"
+                        "FMLA	v19.8h, v3.8h, %[va].8h\n"
+                        ASM_PREFETCH("[%[outptr0], #80]")
+                        "FMLA	v20.8h, v4.8h, %[va].8h\n"
+                        "STP	q19, q20, [%[outptr3]], #32\n"
+                        "FMLA	v21.8h, v5.8h, %[va].8h\n"
+                        "STR	q21, [%[outptr3]], #16\n"
+                        ASM_PREFETCH("[%[outptr1], #80]")
 
-                    // Rows 4-5
-                    "LDP    q16, q17, [%[outptr4]]\n"
-                    "FMUL    v16.8h, v16.8h, %[vb].8h\n"
-                    "LDR    q18, [%[outptr4], #32]\n"
-                    "FMUL    v17.8h, v17.8h, %[vb].8h\n"
-                    "LDP    q19, q20, [%[outptr5]]\n"
-                    "FMUL    v18.8h, v18.8h, %[vb].8h\n" ASM_PREFETCH("[%[outptr2], #80]")
-                    "LDR    q21, [%[outptr5], #32]\n"
-                    "FMUL    v19.8h, v19.8h, %[vb].8h\n"
-                    "LDP    q0,  q1,  [%[inptr], #192]\n"
-                    "FMUL    v20.8h, v20.8h, %[vb].8h\n"
-                    "LDP    q2,  q3,  [%[inptr], #224]\n"
-                    "FMUL    v21.8h, v21.8h, %[vb].8h\n"
-                    "LDP    q4,  q5,  [%[inptr], #256]\n"
-                    "FMLA    v16.8h, v0.8h, %[va].8h\n" ASM_PREFETCH("[%[outptr3], #80]")
-                    "FMLA    v17.8h, v1.8h, %[va].8h\n"
-                    "STP    q16, q17, [%[outptr4]], #32\n"
-                    "FMLA    v18.8h, v2.8h, %[va].8h\n"
-                    "STR    q18, [%[outptr4]], #16\n"
-                    "FMLA    v19.8h, v3.8h, %[va].8h\n" ASM_PREFETCH("[%[outptr4], #80]")
-                    "FMLA    v20.8h, v4.8h, %[va].8h\n"
-                    "STP    q19, q20, [%[outptr5]], #32\n"
-                    "FMLA    v21.8h, v5.8h, %[va].8h\n"
-                    "STR    q21, [%[outptr5]], #16\n"
+                        // Rows 4-5
+                        "LDP	q16, q17, [%[outptr4]]\n"
+                        "FMUL	v16.8h, v16.8h, %[vb].8h\n"
+                        "LDR	q18, [%[outptr4], #32]\n"
+                        "FMUL	v17.8h, v17.8h, %[vb].8h\n"
+                        "LDP	q19, q20, [%[outptr5]]\n"
+                        "FMUL	v18.8h, v18.8h, %[vb].8h\n"
+                        ASM_PREFETCH("[%[outptr2], #80]")
+                        "LDR	q21, [%[outptr5], #32]\n"
+                        "FMUL	v19.8h, v19.8h, %[vb].8h\n"
+                        "LDP	q0,  q1,  [%[inptr], #192]\n"
+                        "FMUL	v20.8h, v20.8h, %[vb].8h\n"
+                        "LDP	q2,  q3,  [%[inptr], #224]\n"
+                        "FMUL	v21.8h, v21.8h, %[vb].8h\n"
+                        "LDP	q4,  q5,  [%[inptr], #256]\n"
+                        "FMLA	v16.8h, v0.8h, %[va].8h\n"
+                        ASM_PREFETCH("[%[outptr3], #80]")
+                        "FMLA	v17.8h, v1.8h, %[va].8h\n"
+                        "STP	q16, q17, [%[outptr4]], #32\n"
+                        "FMLA	v18.8h, v2.8h, %[va].8h\n"
+                        "STR	q18, [%[outptr4]], #16\n"
+                        "FMLA	v19.8h, v3.8h, %[va].8h\n"
+                        ASM_PREFETCH("[%[outptr4], #80]")
+                        "FMLA	v20.8h, v4.8h, %[va].8h\n"
+                        "STP	q19, q20, [%[outptr5]], #32\n"
+                        "FMLA	v21.8h, v5.8h, %[va].8h\n"
+                        "STR	q21, [%[outptr5]], #16\n"
 
-                    // Rows 6-7
-                    "LDP    q16, q17, [%[outptr6]]\n"
-                    "FMUL    v16.8h, v16.8h, %[vb].8h\n"
-                    "LDR    q18, [%[outptr6], #32]\n"
-                    "FMUL    v17.8h, v17.8h, %[vb].8h\n"
-                    "LDP    q19, q20, [%[outptr7]]\n" ASM_PREFETCH("[%[outptr5], #80]")
-                    "FMUL    v18.8h, v18.8h, %[vb].8h\n"
-                    "LDR    q21, [%[outptr7], #32]\n"
-                    "FMUL    v19.8h, v19.8h, %[vb].8h\n"
-                    "LDP    q0,  q1,  [%[inptr], #288]\n"
-                    "FMUL    v20.8h, v20.8h, %[vb].8h\n"
-                    "LDP    q2,  q3,  [%[inptr], #320]\n"
-                    "FMUL    v21.8h, v21.8h, %[vb].8h\n"
-                    "LDP    q4,  q5,  [%[inptr], #352]\n"
-                    "FMLA    v16.8h, v0.8h, %[va].8h\n" ASM_PREFETCH("[%[outptr6], #128]")
-                    "FMLA    v17.8h, v1.8h, %[va].8h\n"
-                    "STP    q16, q17, [%[outptr6]], #32\n"
-                    "FMLA    v18.8h, v2.8h, %[va].8h\n"
-                    "STR    q18, [%[outptr6]], #16\n"
-                    "FMLA    v19.8h, v3.8h, %[va].8h\n" ASM_PREFETCH("[%[outptr7], #128]")
-                    "FMLA    v20.8h, v4.8h, %[va].8h\n"
-                    "STP    q19, q20, [%[outptr7]], #32\n"
-                    "FMLA    v21.8h, v5.8h, %[va].8h\n"
-                    "STR    q21, [%[outptr7]], #16\n"
-                    "ADD    %[inptr], %[inptr], #384\n"
-                    : [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3),
-                    [outptr4] "+r"(outptr4), [outptr5] "+r"(outptr5), [outptr6] "+r"(outptr6), [outptr7] "+r"(outptr7),
-                    [inptr] "+r"(inptr)
-                    : [va] "w"(va), [vb] "w"(vb)
-                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21");
+                        // Rows 6-7
+                        "LDP	q16, q17, [%[outptr6]]\n"
+                        "FMUL	v16.8h, v16.8h, %[vb].8h\n"
+                        "LDR	q18, [%[outptr6], #32]\n"
+                        "FMUL	v17.8h, v17.8h, %[vb].8h\n"
+                        "LDP	q19, q20, [%[outptr7]]\n"
+                        ASM_PREFETCH("[%[outptr5], #80]")
+                        "FMUL	v18.8h, v18.8h, %[vb].8h\n"
+                        "LDR	q21, [%[outptr7], #32]\n"
+                        "FMUL	v19.8h, v19.8h, %[vb].8h\n"
+                        "LDP	q0,  q1,  [%[inptr], #288]\n"
+                        "FMUL	v20.8h, v20.8h, %[vb].8h\n"
+                        "LDP	q2,  q3,  [%[inptr], #320]\n"
+                        "FMUL	v21.8h, v21.8h, %[vb].8h\n"
+                        "LDP	q4,  q5,  [%[inptr], #352]\n"
+                        "FMLA	v16.8h, v0.8h, %[va].8h\n"
+                        ASM_PREFETCH("[%[outptr6], #128]")
+                        "FMLA	v17.8h, v1.8h, %[va].8h\n"
+                        "STP	q16, q17, [%[outptr6]], #32\n"
+                        "FMLA	v18.8h, v2.8h, %[va].8h\n"
+                        "STR	q18, [%[outptr6]], #16\n"
+                        "FMLA	v19.8h, v3.8h, %[va].8h\n"
+                        ASM_PREFETCH("[%[outptr7], #128]")
+                        "FMLA	v20.8h, v4.8h, %[va].8h\n"
+                        "STP	q19, q20, [%[outptr7]], #32\n"
+                        "FMLA	v21.8h, v5.8h, %[va].8h\n"
+                        "STR	q21, [%[outptr7]], #16\n"
+                        "ADD	%[inptr], %[inptr], #384\n"
+                    : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3),
+                      [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                      [inptr] "+r" (inptr)
+                    : [va] "w" (va), [vb] "w" (vb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21"
+                    );
+                }
             }
         }
     }
 }
 
-#endif // __aarch64__ && __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
+#endif // __aarch64__ && (FP16_KERNELS || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
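
For reference, a scalar sketch of what the merge block above computes, assuming only what is visible in the asm: each inner iteration updates an 8-row by 24-column half-precision tile as out = beta * out + alpha * in (the FMUL scales the existing output, the FMLA accumulates the packed input, three q registers per row). The sketch uses float and a single output pointer with a row stride purely for readability; the kernel itself works on half precision through eight separate row pointers.

// Illustrative scalar equivalent of the vectorised 8x24 half-precision merge above.
static inline void merge_tile_ref(float *out, int ldout, const float *in,
                                  float alpha, float beta) {
    for (int row = 0; row < 8; row++) {
        for (int col = 0; col < 24; col++) {
            // Same per-element update the FMUL/FMLA pairs perform.
            out[row * ldout + col] = beta * out[row * ldout + col] + alpha * in[row * 24 + col];
        }
    }
}
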
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_int32_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_int32_12x8.hpp
index 79dd1f0..1a51505 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_int32_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_int32_12x8.hpp
@@ -25,18 +25,16 @@
 
 #ifdef __aarch64__
 
-template <>
-inline void MergeResults<12, 8>(int32_t *out, const int32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const int32_t alpha, const int32_t beta)
-{
+template<>
+inline void MergeResults<12, 8>(int32_t *out, const int32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const int32_t alpha, const int32_t beta) {
     const int32_t *inptr = in;
     prefetch_6x(inptr);
     prefetch_6x(inptr + 96);
 
     int32x4_t alpha_value = vdupq_n_s32(alpha);
-    int32x4_t beta_value  = vdupq_n_s32(beta);
+    int32x4_t beta_value = vdupq_n_s32(beta);
 
-    for(int y = y0; y < ymax; y += 8)
-    {
+    for (int y=y0; y<ymax; y+=8) {
         int32_t *outptr0 = out + (y * ldout) + x0;
         int32_t *outptr1 = outptr0 + ldout;
         int32_t *outptr2 = outptr1 + ldout;
@@ -55,17 +53,14 @@
         prefetch_2x(outptr6);
         prefetch_2x(outptr7);
 
-        for(int i = x0; i < xmax; i += 12)
-        {
+        for (int i=x0; i<xmax; i+=12) {
             int32_t dummyres[12];
 
             /* Make sure we throw away results if Y isn't a multiple of 8.
              * We do this by pointing the result pointer at a dummy buffer
              * we later discard.  */
-            if((y + 7) >= ymax)
-            {
-                switch((y + 7) - ymax)
-                {
+            if ((y+7) >= ymax) {
+                switch ((y + 7) - ymax) {
                     case 6:
                         outptr1 = dummyres;
                     case 5:
@@ -88,12 +83,9 @@
             }
 
             /* For ragged X, manually copy over the valid results. */
-            if((i + 11) >= xmax)
-            {
-                for(int xi = 0; xi < 12; xi++)
-                {
-                    if((i + xi) < xmax)
-                    {
+            if ((i+11) >= xmax) {
+                for (int xi=0; xi<12; xi++) {
+                    if ((i+xi) < xmax) {
                         *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
                         outptr0++;
                         *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
@@ -113,177 +105,175 @@
                     }
                 }
                 inptr += 96;
-            }
-            else
-            {
+            } else {
                 /* Optimized routine to copy an entire block */
-                __asm __volatile(
-                    // Row 0
-                    ASM_PREFETCH("[%x[outptr1], #192]")
-                    "ldr q3, [%x[outptr0]]\n"
-                    "ldr q4, [%x[outptr0], #0x10]\n"
-                    "ldr q5, [%x[outptr0], #0x20]\n"
-                    "mul v3.4s, v3.4s, %[beta_value].4s\n"
-                    "ldr q6, [%x[inptr]]\n"
-                    "mul v4.4s, v4.4s, %[beta_value].4s\n"
-                    "ldr q7, [%x[inptr], #0x10]\n"
-                    "mul v5.4s, v5.4s, %[beta_value].4s\n"
-                    "ldr q8, [%x[inptr], #0x20]\n"
-                    "mla v3.4s, v6.4s, %[alpha_value].4s\n"
-                    "ldr q0, [%x[outptr1]]\n"
-                    "mla v4.4s, v7.4s, %[alpha_value].4s\n"
-                    "ldr q1, [%x[outptr1], #0x10]\n"
-                    "mla v5.4s, v8.4s, %[alpha_value].4s\n"
-                    "ldr q2, [%x[outptr1], #0x20]\n"
+              __asm __volatile (
+                  // Row 0
+                  ASM_PREFETCH("[%x[outptr1], #192]")
+                  "ldr q3, [%x[outptr0]]\n"
+                  "ldr q4, [%x[outptr0], #0x10]\n"
+                  "ldr q5, [%x[outptr0], #0x20]\n"
+                  "mul v3.4s, v3.4s, %[beta_value].4s\n"
+                  "ldr q6, [%x[inptr]]\n"
+                  "mul v4.4s, v4.4s, %[beta_value].4s\n"
+                  "ldr q7, [%x[inptr], #0x10]\n"
+                  "mul v5.4s, v5.4s, %[beta_value].4s\n"
+                  "ldr q8, [%x[inptr], #0x20]\n"
+                  "mla v3.4s, v6.4s, %[alpha_value].4s\n"
+                  "ldr q0, [%x[outptr1]]\n"
+                  "mla v4.4s, v7.4s, %[alpha_value].4s\n"
+                  "ldr q1, [%x[outptr1], #0x10]\n"
+                  "mla v5.4s, v8.4s, %[alpha_value].4s\n"
+                  "ldr q2, [%x[outptr1], #0x20]\n"
 
-                    // Row 1
-                    ASM_PREFETCH("[%x[outptr2], #192]")
-                    "mul v0.4s, v0.4s, %[beta_value].4s\n"
-                    "ldr q6, [%x[inptr], #0x30]\n"
-                    "str q3, [%x[outptr0]], #0x10\n"
-                    "mul v1.4s, v1.4s, %[beta_value].4s\n"
-                    "ldr q7, [%x[inptr], #0x40]\n"
-                    "str q4, [%x[outptr0]], #0x10\n"
-                    "mul v2.4s, v2.4s, %[beta_value].4s\n"
-                    "ldr q8, [%x[inptr], #0x50]\n"
-                    "str q5, [%x[outptr0]], #0x10\n"
-                    "mla v0.4s, v6.4s, %[alpha_value].4s\n"
-                    "ldr q3, [%x[outptr2]]\n"
-                    "mla v1.4s, v7.4s, %[alpha_value].4s\n"
-                    "ldr q4, [%x[outptr2], #0x10]\n"
-                    "mla v2.4s, v8.4s, %[alpha_value].4s\n"
-                    "ldr q5, [%x[outptr2], #0x20]\n"
+                  // Row 1
+                  ASM_PREFETCH("[%x[outptr2], #192]")
+                  "mul v0.4s, v0.4s, %[beta_value].4s\n"
+                  "ldr q6, [%x[inptr], #0x30]\n"
+                  "str q3, [%x[outptr0]], #0x10\n"
+                  "mul v1.4s, v1.4s, %[beta_value].4s\n"
+                  "ldr q7, [%x[inptr], #0x40]\n"
+                  "str q4, [%x[outptr0]], #0x10\n"
+                  "mul v2.4s, v2.4s, %[beta_value].4s\n"
+                  "ldr q8, [%x[inptr], #0x50]\n"
+                  "str q5, [%x[outptr0]], #0x10\n"
+                  "mla v0.4s, v6.4s, %[alpha_value].4s\n"
+                  "ldr q3, [%x[outptr2]]\n"
+                  "mla v1.4s, v7.4s, %[alpha_value].4s\n"
+                  "ldr q4, [%x[outptr2], #0x10]\n"
+                  "mla v2.4s, v8.4s, %[alpha_value].4s\n"
+                  "ldr q5, [%x[outptr2], #0x20]\n"
 
-                    // Row 2
-                    ASM_PREFETCH("[%x[outptr3], #192]")
-                    "mul v3.4s, v3.4s, %[beta_value].4s\n"
-                    "ldr q6, [%x[inptr], #0x60]\n"
-                    "str q0, [%x[outptr1]], #0x10\n"
-                    "mul v4.4s, v4.4s, %[beta_value].4s\n"
-                    "ldr q7, [%x[inptr], #0x70]\n"
-                    "str q1, [%x[outptr1]], #0x10\n"
-                    "mul v5.4s, v5.4s, %[beta_value].4s\n"
-                    "ldr q8, [%x[inptr], #0x80]\n"
-                    "str q2, [%x[outptr1]], #0x10\n"
-                    "mla v3.4s, v6.4s, %[alpha_value].4s\n"
-                    "ldr q0, [%x[outptr3]]\n"
-                    "mla v4.4s, v7.4s, %[alpha_value].4s\n"
-                    "ldr q1, [%x[outptr3], #0x10]\n"
-                    "mla v5.4s, v8.4s, %[alpha_value].4s\n"
-                    "ldr q2, [%x[outptr3], #0x20]\n"
+                  // Row 2
+                  ASM_PREFETCH("[%x[outptr3], #192]")
+                  "mul v3.4s, v3.4s, %[beta_value].4s\n"
+                  "ldr q6, [%x[inptr], #0x60]\n"
+                  "str q0, [%x[outptr1]], #0x10\n"
+                  "mul v4.4s, v4.4s, %[beta_value].4s\n"
+                  "ldr q7, [%x[inptr], #0x70]\n"
+                  "str q1, [%x[outptr1]], #0x10\n"
+                  "mul v5.4s, v5.4s, %[beta_value].4s\n"
+                  "ldr q8, [%x[inptr], #0x80]\n"
+                  "str q2, [%x[outptr1]], #0x10\n"
+                  "mla v3.4s, v6.4s, %[alpha_value].4s\n"
+                  "ldr q0, [%x[outptr3]]\n"
+                  "mla v4.4s, v7.4s, %[alpha_value].4s\n"
+                  "ldr q1, [%x[outptr3], #0x10]\n"
+                  "mla v5.4s, v8.4s, %[alpha_value].4s\n"
+                  "ldr q2, [%x[outptr3], #0x20]\n"
 
-                    // Row 3
-                    ASM_PREFETCH("[%x[outptr4], #192]")
-                    "mul v0.4s, v0.4s, %[beta_value].4s\n"
-                    "ldr q6, [%x[inptr], #0x90]\n"
-                    "str q3, [%x[outptr2]], #0x10\n"
-                    "mul v1.4s, v1.4s, %[beta_value].4s\n"
-                    "ldr q7, [%x[inptr], #0xa0]\n"
-                    "str q4, [%x[outptr2]], #0x10\n"
-                    "mul v2.4s, v2.4s, %[beta_value].4s\n"
-                    "ldr q8, [%x[inptr], #0xb0]\n"
-                    "str q5, [%x[outptr2]], #0x10\n"
-                    "mla v0.4s, v6.4s, %[alpha_value].4s\n"
-                    "ldr q3, [%x[outptr4]]\n"
-                    "mla v1.4s, v7.4s, %[alpha_value].4s\n"
-                    "ldr q4, [%x[outptr4], #0x10]\n"
-                    "mla v2.4s, v8.4s, %[alpha_value].4s\n"
-                    "ldr q5, [%x[outptr4], #0x20]\n"
+                  // Row 3
+                  ASM_PREFETCH("[%x[outptr4], #192]")
+                  "mul v0.4s, v0.4s, %[beta_value].4s\n"
+                  "ldr q6, [%x[inptr], #0x90]\n"
+                  "str q3, [%x[outptr2]], #0x10\n"
+                  "mul v1.4s, v1.4s, %[beta_value].4s\n"
+                  "ldr q7, [%x[inptr], #0xa0]\n"
+                  "str q4, [%x[outptr2]], #0x10\n"
+                  "mul v2.4s, v2.4s, %[beta_value].4s\n"
+                  "ldr q8, [%x[inptr], #0xb0]\n"
+                  "str q5, [%x[outptr2]], #0x10\n"
+                  "mla v0.4s, v6.4s, %[alpha_value].4s\n"
+                  "ldr q3, [%x[outptr4]]\n"
+                  "mla v1.4s, v7.4s, %[alpha_value].4s\n"
+                  "ldr q4, [%x[outptr4], #0x10]\n"
+                  "mla v2.4s, v8.4s, %[alpha_value].4s\n"
+                  "ldr q5, [%x[outptr4], #0x20]\n"
 
-                    // Row 4
-                    ASM_PREFETCH("[%x[outptr5], #192]")
-                    "mul v3.4s, v3.4s, %[beta_value].4s\n"
-                    "ldr q6, [%x[inptr], #0xc0]\n"
-                    "str q0, [%x[outptr3]], #0x10\n"
-                    "mul v4.4s, v4.4s, %[beta_value].4s\n"
-                    "ldr q7, [%x[inptr], #0xd0]\n"
-                    "str q1, [%x[outptr3]], #0x10\n"
-                    "mul v5.4s, v5.4s, %[beta_value].4s\n"
-                    "ldr q8, [%x[inptr], #0xe0]\n"
-                    "str q2, [%x[outptr3]], #0x10\n"
-                    "mla v3.4s, v6.4s, %[alpha_value].4s\n"
-                    "ldr q0, [%x[outptr5]]\n"
-                    "mla v4.4s, v7.4s, %[alpha_value].4s\n"
-                    "ldr q1, [%x[outptr5], #0x10]\n"
-                    "mla v5.4s, v8.4s, %[alpha_value].4s\n"
-                    "ldr q2, [%x[outptr5], #0x20]\n"
+                  // Row 4
+                  ASM_PREFETCH("[%x[outptr5], #192]")
+                  "mul v3.4s, v3.4s, %[beta_value].4s\n"
+                  "ldr q6, [%x[inptr], #0xc0]\n"
+                  "str q0, [%x[outptr3]], #0x10\n"
+                  "mul v4.4s, v4.4s, %[beta_value].4s\n"
+                  "ldr q7, [%x[inptr], #0xd0]\n"
+                  "str q1, [%x[outptr3]], #0x10\n"
+                  "mul v5.4s, v5.4s, %[beta_value].4s\n"
+                  "ldr q8, [%x[inptr], #0xe0]\n"
+                  "str q2, [%x[outptr3]], #0x10\n"
+                  "mla v3.4s, v6.4s, %[alpha_value].4s\n"
+                  "ldr q0, [%x[outptr5]]\n"
+                  "mla v4.4s, v7.4s, %[alpha_value].4s\n"
+                  "ldr q1, [%x[outptr5], #0x10]\n"
+                  "mla v5.4s, v8.4s, %[alpha_value].4s\n"
+                  "ldr q2, [%x[outptr5], #0x20]\n"
 
-                    // Row 5
-                    ASM_PREFETCH("[%x[outptr6], #192]")
-                    "mul v0.4s, v0.4s, %[beta_value].4s\n"
-                    "ldr q6, [%x[inptr], #0xf0]\n"
-                    "str q3, [%x[outptr4]], #0x10\n"
-                    "mul v1.4s, v1.4s, %[beta_value].4s\n"
-                    "ldr q7, [%x[inptr], #0x100]\n"
-                    "str q4, [%x[outptr4]], #0x10\n"
-                    "mul v2.4s, v2.4s, %[beta_value].4s\n"
-                    "ldr q8, [%x[inptr], #0x110]\n"
-                    "str q5, [%x[outptr4]], #0x10\n"
-                    "mla v0.4s, v6.4s, %[alpha_value].4s\n"
-                    "ldr q3, [%x[outptr6]]\n"
-                    "mla v1.4s, v7.4s, %[alpha_value].4s\n"
-                    "ldr q4, [%x[outptr6], #0x10]\n"
-                    "mla v2.4s, v8.4s, %[alpha_value].4s\n"
-                    "ldr q5, [%x[outptr6], #0x20]\n"
+                  // Row 5
+                  ASM_PREFETCH("[%x[outptr6], #192]")
+                  "mul v0.4s, v0.4s, %[beta_value].4s\n"
+                  "ldr q6, [%x[inptr], #0xf0]\n"
+                  "str q3, [%x[outptr4]], #0x10\n"
+                  "mul v1.4s, v1.4s, %[beta_value].4s\n"
+                  "ldr q7, [%x[inptr], #0x100]\n"
+                  "str q4, [%x[outptr4]], #0x10\n"
+                  "mul v2.4s, v2.4s, %[beta_value].4s\n"
+                  "ldr q8, [%x[inptr], #0x110]\n"
+                  "str q5, [%x[outptr4]], #0x10\n"
+                  "mla v0.4s, v6.4s, %[alpha_value].4s\n"
+                  "ldr q3, [%x[outptr6]]\n"
+                  "mla v1.4s, v7.4s, %[alpha_value].4s\n"
+                  "ldr q4, [%x[outptr6], #0x10]\n"
+                  "mla v2.4s, v8.4s, %[alpha_value].4s\n"
+                  "ldr q5, [%x[outptr6], #0x20]\n"
 
-                    // Row 6
-                    ASM_PREFETCH("[%x[outptr7], #192]")
-                    "mul v3.4s, v3.4s, %[beta_value].4s\n"
-                    "ldr q6, [%x[inptr], #0x120]\n"
-                    "str q0, [%x[outptr5]], #0x10\n"
-                    "mul v4.4s, v4.4s, %[beta_value].4s\n"
-                    "ldr q7, [%x[inptr], #0x130]\n"
-                    "str q1, [%x[outptr5]], #0x10\n"
-                    "mul v5.4s, v5.4s, %[beta_value].4s\n"
-                    "ldr q8, [%x[inptr], #0x140]\n"
-                    "str q2, [%x[outptr5]], #0x10\n"
-                    "mla v3.4s, v6.4s, %[alpha_value].4s\n"
-                    "ldr q0, [%x[outptr7]]\n"
-                    "mla v4.4s, v7.4s, %[alpha_value].4s\n"
-                    "ldr q1, [%x[outptr7], #0x10]\n"
-                    "mla v5.4s, v8.4s, %[alpha_value].4s\n"
-                    "ldr q2, [%x[outptr7], #0x20]\n"
+                  // Row 6
+                  ASM_PREFETCH("[%x[outptr7], #192]")
+                  "mul v3.4s, v3.4s, %[beta_value].4s\n"
+                  "ldr q6, [%x[inptr], #0x120]\n"
+                  "str q0, [%x[outptr5]], #0x10\n"
+                  "mul v4.4s, v4.4s, %[beta_value].4s\n"
+                  "ldr q7, [%x[inptr], #0x130]\n"
+                  "str q1, [%x[outptr5]], #0x10\n"
+                  "mul v5.4s, v5.4s, %[beta_value].4s\n"
+                  "ldr q8, [%x[inptr], #0x140]\n"
+                  "str q2, [%x[outptr5]], #0x10\n"
+                  "mla v3.4s, v6.4s, %[alpha_value].4s\n"
+                  "ldr q0, [%x[outptr7]]\n"
+                  "mla v4.4s, v7.4s, %[alpha_value].4s\n"
+                  "ldr q1, [%x[outptr7], #0x10]\n"
+                  "mla v5.4s, v8.4s, %[alpha_value].4s\n"
+                  "ldr q2, [%x[outptr7], #0x20]\n"
 
-                    // Row 7
-                    "mul v0.4s, v0.4s, %[beta_value].4s\n"
-                    "ldr q6, [%x[inptr], #0x150]\n"
-                    "str q3, [%x[outptr6]], #0x10\n"
-                    "mul v1.4s, v1.4s, %[beta_value].4s\n"
-                    "ldr q7, [%x[inptr], #0x160]\n"
-                    "str q4, [%x[outptr6]], #0x10\n"
-                    "mul v2.4s, v2.4s, %[beta_value].4s\n"
-                    "ldr q8, [%x[inptr], #0x170]\n"
-                    "str q5, [%x[outptr6]], #0x10\n"
-                    "mla v0.4s, v6.4s, %[alpha_value].4s\n"
-                    "mla v1.4s, v7.4s, %[alpha_value].4s\n"
-                    "mla v2.4s, v8.4s, %[alpha_value].4s\n"
-                    "str q0, [%x[outptr7]], #0x10\n"
-                    "str q1, [%x[outptr7]], #0x10\n"
-                    "str q2, [%x[outptr7]], #0x10\n"
+                  // Row 7
+                  "mul v0.4s, v0.4s, %[beta_value].4s\n"
+                  "ldr q6, [%x[inptr], #0x150]\n"
+                  "str q3, [%x[outptr6]], #0x10\n"
+                  "mul v1.4s, v1.4s, %[beta_value].4s\n"
+                  "ldr q7, [%x[inptr], #0x160]\n"
+                  "str q4, [%x[outptr6]], #0x10\n"
+                  "mul v2.4s, v2.4s, %[beta_value].4s\n"
+                  "ldr q8, [%x[inptr], #0x170]\n"
+                  "str q5, [%x[outptr6]], #0x10\n"
+                  "mla v0.4s, v6.4s, %[alpha_value].4s\n"
+                  "mla v1.4s, v7.4s, %[alpha_value].4s\n"
+                  "mla v2.4s, v8.4s, %[alpha_value].4s\n"
+                  "str q0, [%x[outptr7]], #0x10\n"
+                  "str q1, [%x[outptr7]], #0x10\n"
+                  "str q2, [%x[outptr7]], #0x10\n"
 
-                    "add %x[inptr], %x[inptr], #0x180\n"
-                    : [outptr0] "+r"(outptr0),
-                    [outptr1] "+r"(outptr1),
-                    [outptr2] "+r"(outptr2),
-                    [outptr3] "+r"(outptr3),
-                    [outptr4] "+r"(outptr4),
-                    [outptr5] "+r"(outptr5),
-                    [outptr6] "+r"(outptr6),
-                    [outptr7] "+r"(outptr7),
-                    [inptr] "+r"(inptr)
-                    : [alpha_value] "w"(alpha_value),
-                    [beta_value] "w"(beta_value)
-                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8");
+                  "add %x[inptr], %x[inptr], #0x180\n"
+                  : [outptr0] "+r" (outptr0),
+                    [outptr1] "+r" (outptr1),
+                    [outptr2] "+r" (outptr2),
+                    [outptr3] "+r" (outptr3),
+                    [outptr4] "+r" (outptr4),
+                    [outptr5] "+r" (outptr5),
+                    [outptr6] "+r" (outptr6),
+                    [outptr7] "+r" (outptr7),
+                    [inptr] "+r" (inptr)
+                  : [alpha_value] "w" (alpha_value),
+                    [beta_value] "w" (beta_value)
+                  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8"
+              );
             }
         }
     }
 }
 
-template <>
-inline void MergeResults<12, 8>(uint32_t *out, const uint32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const uint32_t alpha, const uint32_t beta)
-{
-    // Since the above code uses only MUL and MLA instructions discard the "unsignedness" and proceed safely.
-    MergeResults<12, 8>(reinterpret_cast<int32_t *>(out), reinterpret_cast<const int32_t *>(in), ldout, y0, ymax, x0, xmax, static_cast<const int32_t>(alpha), static_cast<const int32_t>(beta));
+template<>
+inline void MergeResults<12, 8>(uint32_t *out, const uint32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const uint32_t alpha, const uint32_t beta) {
+  // Since the above code uses only MUL and MLA instructions discard the "unsignedness" and proceed safely.
+  MergeResults<12, 8>(reinterpret_cast<int32_t*>(out), reinterpret_cast<const int32_t*>(in), ldout, y0, ymax, x0, xmax, static_cast<const int32_t>(alpha), static_cast<const int32_t>(beta));
 }
 
 #endif // __aarch64__
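
As a reading aid for the specialisation above: each inner iteration merges one 8-row by 12-column int32 block as out = alpha * in + beta * out, and the ragged-Y case is handled by pointing any out-of-range row at the stack dummyres buffer so the full-block store path never writes outside the real output. A minimal sketch of the same trick, with illustrative names only:

#include <cstdint>

// Rows beyond the valid range are redirected to a scratch buffer and their
// results simply discarded, mirroring the dummyres handling above.
static void merge_block_ref(int32_t *outrows[8], const int32_t *in,
                            int valid_rows, int32_t alpha, int32_t beta) {
    int32_t dummy[12] = {};
    for (int r = 0; r < 8; r++) {
        int32_t *dst = (r < valid_rows) ? outrows[r] : dummy;
        for (int c = 0; c < 12; c++) {
            dst[c] = alpha * in[r * 12 + c] + beta * dst[c];
        }
    }
}
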
diff --git a/src/core/NEON/kernels/arm_gemm/profiler.hpp b/src/core/NEON/kernels/arm_gemm/profiler.hpp
index ada0c95..1b944c4 100644
--- a/src/core/NEON/kernels/arm_gemm/profiler.hpp
+++ b/src/core/NEON/kernels/arm_gemm/profiler.hpp
@@ -31,75 +31,65 @@
 #include <mutex>
 #endif
 
-namespace arm_gemm
-{
+namespace arm_gemm {
+
 #ifndef NO_MULTI_THREADING
 extern std::mutex report_mutex;
 #endif
 
-class profiler
-{
+class profiler {
 private:
-    static const int maxevents         = 100000;
-    unsigned long    times[maxevents]  = {};
-    unsigned long    units[maxevents]  = {};
-    int              events[maxevents] = {};
-    int              currentevent      = 0;
-    int              countfd           = 0;
+    static const int maxevents = 100000;
+    unsigned long times[maxevents] = { };
+    unsigned long units[maxevents] = { };
+    int events[maxevents] = { };
+    int currentevent=0;
+    int countfd=0;
 
-    class ScopedProfilerClass
-    {
+    class ScopedProfilerClass {
     private:
         profiler &_parent;
-        bool      legal = false;
+        bool legal=false;
 
     public:
-        ScopedProfilerClass(profiler &prof, int i, unsigned long u)
-            : _parent(prof)
-        {
-            if(prof.currentevent == maxevents)
+        ScopedProfilerClass(profiler &prof, int i, unsigned long u) : _parent(prof) {
+            if (prof.currentevent==maxevents)
                 return;
 
-            prof.events[prof.currentevent] = i;
-            prof.units[prof.currentevent]  = u;
-            legal                          = true;
+            prof.events[prof.currentevent]=i;
+            prof.units[prof.currentevent]=u;
+            legal=true;
             start_counter(prof.countfd);
         }
 
-        ~ScopedProfilerClass()
-        {
-            if(!legal)
-                return;
+        ~ScopedProfilerClass() {
+            if (!legal) return;
 
-            long long cycs                        = stop_counter(_parent.countfd);
+            long long cycs = stop_counter(_parent.countfd);
             _parent.times[_parent.currentevent++] = cycs;
         }
     };
 
 public:
-    profiler()
-    {
-        countfd = open_cycle_counter();
+    profiler() {
+        countfd=open_cycle_counter();
     }
 
-    ~profiler()
-    {
+    ~profiler() {
         close(countfd);
-        int           tots[5];
+        int tots[5];
         unsigned long counts[5];
         unsigned long tunits[5];
-        const char   *descs[] = { "Prepare A", "Prepare B", "Kernel", "Merge" };
+        const char * descs[] = { "Prepare A", "Prepare B", "Kernel", "Merge" };
 
-        for(int i = 1; i < 5; i++)
-        {
-            tots[i]   = 0;
+        for (int i=1; i<5; i++) {
+            tots[i] = 0;
             counts[i] = 0;
             tunits[i] = 0;
         }
 
-        for(int i = 0; i < currentevent; i++)
-        {
-            //            printf("%10s: %ld\n", descs[events[i]-1], times[i]);
+        for (int i=0; i<currentevent; i++) {
+//            printf("%10s: %ld\n", descs[events[i]-1], times[i]);
             tots[events[i]]++;
             counts[events[i]] += times[i];
             tunits[events[i]] += units[i];
@@ -113,31 +103,26 @@
 #endif
 
         printf("%20s  %9s %9s %9s %12s %9s\n", "", "Events", "Total", "Average", "Bytes/MACs", "Per cycle");
-        for(int i = 1; i < 5; i++)
-        {
-            printf("%20s: %9d %9ld %9ld %12lu %9.2f\n", descs[i - 1], tots[i], counts[i], counts[i] / tots[i], tunits[i], (float)tunits[i] / counts[i]);
+        for (int i=1; i<5; i++) {
+            printf("%20s: %9d %9ld %9ld %12lu %9.2f\n",descs[i-1],tots[i],counts[i],counts[i]/tots[i],tunits[i],(float)tunits[i]/counts[i]);
         }
     }
 
     template <typename T>
-    void operator()(int i, unsigned long u, T func)
-    {
-        if(currentevent == maxevents)
-        {
+    void operator() (int i, unsigned long u, T func) {
+        if (currentevent==maxevents) {
             func();
-        }
-        else
-        {
+        } else {
             events[currentevent] = i;
-            units[currentevent]  = u;
+            units[currentevent] = u;
             start_counter(countfd);
             func();
-            long long cycs        = stop_counter(countfd);
+            long long cycs = stop_counter(countfd);
             times[currentevent++] = cycs;
         }
     }
-    ScopedProfilerClass ScopedProfiler(int i, unsigned long u)
-    {
+
+    ScopedProfilerClass ScopedProfiler(int i, unsigned long u) {
         return ScopedProfilerClass(*this, i, u);
     }
 };
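
A hedged usage sketch for the class above (the call sites are not part of this hunk, and do_kernel / do_merge below are hypothetical stand-ins): the first argument selects the category printed by the destructor, with 1..4 mapping to "Prepare A", "Prepare B", "Kernel" and "Merge", and the second is the unit count (bytes or MACs) attributed to that event.

// Illustrative only: driving the profiler via operator() and the RAII helper.
// Assumes the profiler header above is included.
static void do_kernel() { /* hypothetical work */ }
static void do_merge()  { /* hypothetical work */ }

static void run_one_block(arm_gemm::profiler &prof) {
    unsigned long mac_count = 12UL * 8UL * 256UL;   // units charged to this event
    prof(3, mac_count, [&]() { do_kernel(); });     // 3 -> "Kernel" in the report

    {
        auto sp = prof.ScopedProfiler(4, 0);        // 4 -> "Merge"
        do_merge();
    }                                               // timing stops when sp goes out of scope
}
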
diff --git a/src/core/NEON/kernels/arm_gemm/transform.hpp b/src/core/NEON/kernels/arm_gemm/transform.hpp
index c80bb59..35e61b0 100644
--- a/src/core/NEON/kernels/arm_gemm/transform.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transform.hpp
@@ -35,63 +35,51 @@
  * being a multiple of the block sizes.
  */
 template <unsigned IntBy, unsigned int BlockBy, bool Transposed, size_t TOutSize, size_t TInSize>
-struct TransformImpl
-{
+struct TransformImpl {
     template <typename TOut, typename TIn>
-    static void Transform(TOut *out, const TIn *const in, const int stride,
-                          const int y0, const int ymax, const int x0, const int xmax)
-    {
+    static void Transform(TOut* out, const TIn* const in, const int stride,
+                          const int y0, const int ymax, const int x0, const int xmax) {
         const int n_whole_y_blocks = (ymax - y0) / IntBy;
-        const int y_remainders     = (ymax - y0) % IntBy;
-        const int n_y_blocks       = n_whole_y_blocks + (y_remainders ? 1 : 0);
+        const int y_remainders = (ymax - y0) % IntBy;
+        const int n_y_blocks = n_whole_y_blocks + (y_remainders ? 1 : 0);
 
         const int n_whole_x_blocks = (xmax - x0) / BlockBy;
-        const int x_remainders     = (xmax - x0) % BlockBy;
-        const int n_x_blocks       = n_whole_x_blocks + (x_remainders ? 1 : 0);
+        const int x_remainders = (xmax - x0) % BlockBy;
+        const int n_x_blocks = n_whole_x_blocks + (x_remainders ? 1 : 0);
 
         // "Y" loop: advance down the rows of the source IntBy rows at a time.
         // Set up fill_rows to show the number rows to copy from, and blank_rows
         // for the number of blank rows to add.
-        for(int y_block = 0; y_block < n_y_blocks; y_block++)
-        {
-            int fill_rows  = (y_block < n_whole_y_blocks) ? IntBy : y_remainders;
+        for (int y_block=0 ; y_block < n_y_blocks; y_block++) {
+            int fill_rows = (y_block < n_whole_y_blocks) ? IntBy : y_remainders;
             int blank_rows = IntBy - fill_rows;
 
             int y_base = y0 + (y_block * IntBy);
 
             // So now advance along this block of rows, BlockBy columns at a time.
-            for(int x_block = 0; x_block < n_x_blocks; x_block++)
-            {
-                int fill_cols  = (x_block < n_whole_x_blocks) ? BlockBy : x_remainders;
+            for (int x_block=0 ; x_block < n_x_blocks; x_block++) {
+                int fill_cols = (x_block < n_whole_x_blocks) ? BlockBy : x_remainders;
                 int blank_cols = BlockBy - fill_cols;
 
                 int x_base = x0 + (x_block * BlockBy);
 
-                for(int row = 0; row < fill_rows; row++)
-                {
-                    for(int col = 0; col < fill_cols; col++)
-                    {
+                for (int row = 0; row < fill_rows; row++) {
+                    for (int col = 0; col < fill_cols; col++) {
                         // In-range copy.  If it's transposed, we reverse the sense of rows and columns here.
-                        if(Transposed)
-                        {
+                        if (Transposed) {
                             *out++ = static_cast<TOut>(in[(x_base + col) * stride + y_base + row]);
-                        }
-                        else
-                        {
+                        } else {
                             *out++ = static_cast<TOut>(in[(y_base + row) * stride + x_base + col]);
                         }
                     }
                     // "col" tail - row is in range but column is out of range.
-                    for(int col = 0; col < blank_cols; col++)
-                    {
+                    for (int col=0; col < blank_cols; col++) {
                         *out++ = static_cast<TOut>(0);
                     }
                 }
                 // "row" tail - row is out of range so fill with zeros always.
-                for(int row = 0; row < blank_rows; row++)
-                {
-                    for(int col = 0; col < (fill_cols + blank_cols); col++)
-                    {
+                for (int row = 0; row < blank_rows; row++) {
+                    for (int col=0; col < (fill_cols + blank_cols); col++) {
                         *out++ = static_cast<TOut>(0);
                     }
                 }
@@ -100,9 +88,8 @@
     }
 
     template <typename T>
-    static inline void Transform(T *out, const T *const in, const int stride,
-                                 const int k0, const int kmax, const int x0, const int xmax)
-    {
+    static inline void Transform(T* out, const T* const in, const int stride,
+                                 const int k0, const int kmax, const int x0, const int xmax) {
         Transform<T, T>(out, in, stride, k0, kmax, x0, xmax);
     }
 };
@@ -110,13 +97,15 @@
 /*****************************************************************************/
 template <unsigned int IntBy, unsigned int BlockBy, bool Transposed, typename TOut, typename TIn>
 void Transform(
-    TOut *out, const TIn *const in, const int stride,
-    const int k0, const int kmax, const int x0, const int xmax)
-{
-    // Redirect to a specialised implementation predicated on argument size.
-    TransformImpl<IntBy, BlockBy, Transposed, sizeof(TOut), sizeof(TIn)>::Transform(
-        out, in, stride, k0, kmax, x0, xmax);
+  TOut* out, const TIn* const in, const int stride,
+  const int k0, const int kmax, const int x0, const int xmax
+) {
+  // Redirect to a specialised implementation predicated on argument size.
+  TransformImpl<IntBy, BlockBy, Transposed, sizeof(TOut), sizeof(TIn)>::Transform(
+    out, in, stride, k0, kmax, x0, xmax
+  );
 }
 /*****************************************************************************/
 
 #include "transforms/list.hpp"
+
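
A worked example of the block arithmetic in TransformImpl::Transform above, with illustrative numbers: for IntBy = 8 and ymax - y0 = 20 there are two whole Y blocks plus a remainder of 4, so the loop runs three times and the last block writes fill_rows = 4 real rows followed by blank_rows = 4 rows of zeros; the X direction works the same way with BlockBy.

// Block-count arithmetic from Transform(), checked against the example numbers.
constexpr int IntBy_ex = 8;
constexpr int rows_ex  = 20;                                  // ymax - y0
constexpr int n_whole_y_blocks_ex = rows_ex / IntBy_ex;       // 2
constexpr int y_remainders_ex     = rows_ex % IntBy_ex;       // 4
constexpr int n_y_blocks_ex = n_whole_y_blocks_ex + (y_remainders_ex ? 1 : 0);
static_assert(n_y_blocks_ex == 3, "two full blocks plus one ragged block");
static_assert(IntBy_ex - y_remainders_ex == 4, "last block pads four zero rows");
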
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp
index 501d6bf..e485ca7 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp
@@ -29,17 +29,15 @@
 
 #include "../asmlib.hpp"
 
-template <>
-template <typename T>
-inline void TransformImpl<6, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
-{
-    uint32_t       *outptr = reinterpret_cast<uint32_t *>(out);
-    const uint32_t *inptr  = reinterpret_cast<const uint32_t *>(in);
+template<>
+template<typename T>
+inline void TransformImpl<6, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
+    uint32_t *outptr = reinterpret_cast<uint32_t *>(out);
+    const uint32_t *inptr = reinterpret_cast<const uint32_t *>(in);
 
     uint32_t zerobuff[8];
 
-    for(int y = y0; y < ymax; y += 6)
-    {
+    for (int y=y0; y<ymax; y+=6) {
         const uint32_t *inptr0 = inptr + y * ldin + k0;
         const uint32_t *inptr1 = inptr0 + ldin;
         const uint32_t *inptr2 = inptr1 + ldin;
@@ -54,14 +52,11 @@
         //prefetch_2x(inptr4);
         //prefetch_2x(inptr5);
 
-        int x = (kmax - k0);
-        for(; x > 7; x -= 8)
-        {
+        int x=(kmax-k0);
+        for (;x>7;x-=8) {
             /* Cope with ragged cases by copying from a buffer of zeroes instead */
-            if((y + 5) >= ymax)
-            {
-                switch((y + 5) - ymax)
-                {
+            if ((y + 5) >= ymax) {
+                switch ((y + 5) - ymax) {
                     /* Everything falls through in here */
                     case 4:
                         inptr1 = zerobuff;
@@ -80,67 +75,73 @@
                 }
             }
 
-            __asm __volatile(
+
+            __asm __volatile (
                 // Load up 8 elements (2 vectors) from each of 8 sources.
-                "VLD1.32    {d0-d3}, [%[inptr0]]!\n"   // q0=A0A1A2A3
-                "VLD1.32    {d4-d7}, [%[inptr1]]!\n"   // q2=B0B1B2B3
-                "VLD1.32    {d8-d11}, [%[inptr2]]!\n"  // q4=C0C1C2C3
-                "VZIP.32    q0, q4\n"                  // q0=A0C0A1C1, q4 = A2C2A3C3
-                "VLD1.32    {d12-d15}, [%[inptr3]]!\n" // q6=D0D1D2D3
-                "VZIP.32    q2, q6\n"                  // q2=B0D0B1D1, q6 = B2D2B3D3
-                "VLD1.32    {d16-d19}, [%[inptr4]]!\n"
-                "VLD1.32    {d20-d23}, [%[inptr5]]!\n"
-                "VZIP.32    q8, q10\n" // q8=E0F0E1F1, q10 = E2F2E3F3
+                "VLD1.32	{d0-d3}, [%[inptr0]]!\n"   // q0=A0A1A2A3
+                "VLD1.32	{d4-d7}, [%[inptr1]]!\n"   // q2=B0B1B2B3
+                "VLD1.32	{d8-d11}, [%[inptr2]]!\n"  // q4=C0C1C2C3
+                "VZIP.32	q0, q4\n"     // q0=A0C0A1C1, q4 = A2C2A3C3
+                "VLD1.32	{d12-d15}, [%[inptr3]]!\n" // q6=D0D1D2D3
+                "VZIP.32	q2, q6\n"     // q2=B0D0B1D1, q6 = B2D2B3D3
+                "VLD1.32	{d16-d19}, [%[inptr4]]!\n"
+                "VLD1.32	{d20-d23}, [%[inptr5]]!\n"
+                "VZIP.32	q8, q10\n"    // q8=E0F0E1F1, q10 = E2F2E3F3
                 ASM_PREFETCH("[%[inptr0], #128]")
-                "VZIP.32    q0, q2\n" // q0 = A0B0C0D0, q2 = A1B1C1D1
+                "VZIP.32	q0, q2\n"    // q0 = A0B0C0D0, q2 = A1B1C1D1
 
                 // Store first elements
-                "VST1.32    {d0-d1}, [%[outptr]]!\n"
-                "VST1.32    {d16}, [%[outptr]]!\n"
+                "VST1.32	{d0-d1}, [%[outptr]]!\n"
+                "VST1.32	{d16}, [%[outptr]]!\n"
 
-                "VZIP.32    q4, q6\n" // q4 = A2B2C2D2, q6 = A3B3C3D3
+                "VZIP.32	q4, q6\n"    // q4 = A2B2C2D2, q6 = A3B3C3D3
 
                 // Store second elements
-                "VST1.32    {d4-d5}, [%[outptr]]!\n"
-                "VZIP.32    q1, q5\n" ASM_PREFETCH("[%[inptr1], #128]")
-                "VST1.32    {d17}, [%[outptr]]!\n"
-                "VZIP.32    q3, q7\n"
+                "VST1.32	{d4-d5}, [%[outptr]]!\n"
+                "VZIP.32	q1, q5\n"
+                ASM_PREFETCH("[%[inptr1], #128]")
+                "VST1.32	{d17}, [%[outptr]]!\n"
+                "VZIP.32	q3, q7\n"
 
                 // Store third elements
-                "VZIP.32    q9, q11\n"
-                "VST1.32    {d8-d9}, [%[outptr]]!\n"
-                "VZIP.32    q1, q3\n" ASM_PREFETCH("[%[inptr2], #128]")
-                "VST1.32    {d20}, [%[outptr]]!\n"
+                "VZIP.32	q9, q11\n"
+                "VST1.32	{d8-d9}, [%[outptr]]!\n"
+                "VZIP.32	q1, q3\n"
+                ASM_PREFETCH("[%[inptr2], #128]")
+                "VST1.32	{d20}, [%[outptr]]!\n"
 
                 // Store fourth elements
-                "VZIP.32    q5, q7\n"
-                "VST1.32    {d12-d13}, [%[outptr]]!\n" ASM_PREFETCH("[%[inptr3], #128]")
-                "VST1.32    {d21}, [%[outptr]]!\n"
+                "VZIP.32	q5, q7\n"
+                "VST1.32	{d12-d13}, [%[outptr]]!\n"
+                ASM_PREFETCH("[%[inptr3], #128]")
+                "VST1.32	{d21}, [%[outptr]]!\n"
 
                 // Fifth
-                "VST1.32    {d2-d3}, [%[outptr]]!\n" ASM_PREFETCH("[%[inptr4], #128]")
-                "VST1.32    {d18}, [%[outptr]]!\n"
+                "VST1.32	{d2-d3}, [%[outptr]]!\n"
+                ASM_PREFETCH("[%[inptr4], #128]")
+                "VST1.32	{d18}, [%[outptr]]!\n"
 
                 // Sixth
-                "VST1.32    {d6-d7}, [%[outptr]]!\n" ASM_PREFETCH("[%[inptr5], #128]")
-                "VST1.32    {d19}, [%[outptr]]!\n"
+                "VST1.32	{d6-d7}, [%[outptr]]!\n"
+                ASM_PREFETCH("[%[inptr5], #128]")
+                "VST1.32	{d19}, [%[outptr]]!\n"
 
                 // Seventh
-                "VST1.32    {d10-d11}, [%[outptr]]!\n"
-                "VST1.32    {d22}, [%[outptr]]!\n"
+                "VST1.32	{d10-d11}, [%[outptr]]!\n"
+                "VST1.32	{d22}, [%[outptr]]!\n"
 
                 // Eighth
-                "VST1.32    {d14-d15}, [%[outptr]]!\n"
-                "VST1.32    {d23}, [%[outptr]]!\n"
+                "VST1.32	{d14-d15}, [%[outptr]]!\n"
+                "VST1.32	{d23}, [%[outptr]]!\n"
 
-                : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
-                [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [outptr] "+r"(outptr)
+                : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
+                  [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [outptr] "+r" (outptr)
                 :
-                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12");
+                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12"
+            );
         }
 
-        for(; x > 0; x--)
-        {
+        for (;x>0;x--) {
             *outptr++ = *inptr0++;
             *outptr++ = *inptr1++;
             *outptr++ = *inptr2++;
@@ -151,4 +152,4 @@
     }
 }
 
-#endif // __arm__
+#endif  // __arm__
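
For orientation: the scalar tail loop at the end of the hunk above is also the reference behaviour of the asm body, which handles eight columns per pass. For every column the transform emits one element from each of the six source rows, so the packed output reads A0 B0 C0 D0 E0 F0, A1 B1 C1 D1 E1 F1, and so on, exactly the layout the VZIP comments (q0 = A0B0C0D0, q8 = E0F0E1F1, ...) are assembling. A compact reference, for illustration only:

#include <cstdint>

// out[6*k + r] == row r, column k: the layout produced by the 6-way interleave above.
static void interleave6_ref(uint32_t *out, const uint32_t *const rows[6], int width) {
    for (int k = 0; k < width; k++) {
        for (int r = 0; r < 6; r++) {
            *out++ = rows[r][k];
        }
    }
}
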
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
index ea32c96..a7e17fa 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
@@ -31,86 +31,97 @@
 template <>
 template <typename T>
 inline void TransformImpl<8, 1, true, 4, 4>::Transform(
-    T *out, const T *const in, const int stride,
-    const int x0, const int xmax, const int k0, const int kmax)
-{
-    // Redirect to a 16x uint16_t specialisation
-    TransformImpl<16, 1, true, 2, 2>::Transform(
-        reinterpret_cast<uint16_t *>(out),
-        reinterpret_cast<const uint16_t *const>(in),
-        stride * 2, x0 * 2, xmax * 2, k0, kmax);
+    T* out, const T* const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax
+) {
+  // Redirect to a 16x uint16_t specialisation
+  TransformImpl<16, 1, true, 2, 2>::Transform(
+    reinterpret_cast<uint16_t *>(out),
+    reinterpret_cast<const uint16_t * const>(in),
+    stride*2, x0*2, xmax*2, k0, kmax
+  );
 }
 
 // Generic 12x16-bit sized specialisation
 template <>
 template <typename T>
 inline void TransformImpl<16, 1, true, 2, 2>::Transform(
-    T *out, const T *const in, const int stride,
-    const int x0, const int xmax, const int k0, const int kmax)
-{
-    // Redirect to a uint16_t specialisation
-    Transform(
-        reinterpret_cast<uint16_t *>(out),
-        reinterpret_cast<const uint16_t *const>(in),
-        stride, x0, xmax, k0, kmax);
+    T* out, const T* const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax
+) {
+  // Redirect to a uint16_t specialisation
+  Transform(
+    reinterpret_cast<uint16_t *>(out),
+    reinterpret_cast<const uint16_t * const>(in),
+    stride, x0, xmax, k0, kmax
+  );
 }
 
 // Specialised 16 x uint16_t version
 template <>
-inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out)
-{
-    __asm volatile(
-        "VLD1.32    {d0-d3}, [%[in0]]!\n"
-        "VST1.32    {d0-d3}, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
-        : [in0] "+r"(in0),
-        [out] "+r"(out)
-        :
-        : "q0", "q1", "memory");
+inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) {
+  __asm volatile (
+    "VLD1.32	{d0-d3}, [%[in0]]!\n"
+    "VST1.32	{d0-d3}, [%[out]]\n"
+    ASM_PREFETCH("[%[in0], #192]")
+    : [in0] "+r" (in0),
+      [out] "+r" (out)
+    :
+    : "q0", "q1", "memory"
+  );
 }
 
 template <>
-inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out)
-{
-    __asm volatile(
-        "VLD1.32    {d0-d3}, [%[in0]]!\n"
-        "VST1.32    {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in0], #192]")
-        "VLD1.32    {d0-d3}, [%[in1]]!\n"
-        "VST1.32    {d0-d3}, [%[out]]\n" ASM_PREFETCH("[%[in1], #192]") "SUB    %[out], %[out], #32\n"
-        : [in0] "+r"(in0),
-        [in1] "+r"(in1),
-        [out] "+r"(out)
-        :
-        : "q0", "q1", "memory");
+inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out) {
+  __asm volatile (
+    "VLD1.32	{d0-d3}, [%[in0]]!\n"
+    "VST1.32	{d0-d3}, [%[out]]!\n"
+    ASM_PREFETCH("[%[in0], #192]")
+    "VLD1.32	{d0-d3}, [%[in1]]!\n"
+    "VST1.32	{d0-d3}, [%[out]]\n"
+    ASM_PREFETCH("[%[in1], #192]")
+    "SUB	%[out], %[out], #32\n"
+    : [in0] "+r" (in0),
+      [in1] "+r" (in1),
+      [out] "+r" (out)
+    :
+    : "q0", "q1", "memory"
+  );
 }
 
 template <>
-inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out)
-{
-    __asm __volatile(
-        "VLD1.32    {d0-d3}, [%[in0]]!\n"
-        "VST1.32    {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in0], #192]")
-        "VLD1.32    {d0-d3}, [%[in1]]!\n"
-        "VST1.32    {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in1], #192]")
-        "VLD1.32    {d0-d3}, [%[in2]]!\n"
-        "VST1.32    {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in2], #192]")
-        "VLD1.32    {d0-d3}, [%[in3]]!\n"
-        "VST1.32    {d0-d3}, [%[out]]\n" ASM_PREFETCH("[%[in3], #192]") "SUB    %[out], %[out], #96\n"
-        : [in0] "+r"(in0),
-        [in1] "+r"(in1),
-        [in2] "+r"(in2),
-        [in3] "+r"(in3),
-        [out] "+r"(out)
-        :
-        : "q0", "q1", "memory");
+inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) {
+  __asm __volatile (
+    "VLD1.32	{d0-d3}, [%[in0]]!\n"
+    "VST1.32	{d0-d3}, [%[out]]!\n"
+    ASM_PREFETCH("[%[in0], #192]")
+    "VLD1.32	{d0-d3}, [%[in1]]!\n"
+    "VST1.32	{d0-d3}, [%[out]]!\n"
+    ASM_PREFETCH("[%[in1], #192]")
+    "VLD1.32	{d0-d3}, [%[in2]]!\n"
+    "VST1.32	{d0-d3}, [%[out]]!\n"
+    ASM_PREFETCH("[%[in2], #192]")
+    "VLD1.32	{d0-d3}, [%[in3]]!\n"
+    "VST1.32	{d0-d3}, [%[out]]\n"
+    ASM_PREFETCH("[%[in3], #192]")
+    "SUB	%[out], %[out], #96\n"
+    : [in0] "+r" (in0),
+      [in1] "+r" (in1),
+      [in2] "+r" (in2),
+      [in3] "+r" (in3),
+      [out] "+r" (out)
+    :
+    : "q0", "q1", "memory"
+  );
 }
 
 template <>
 template <>
 inline void TransformImpl<16, 1, true, 2, 2>::Transform(
-    uint16_t *out, const uint16_t *const in, const int stride,
-    const int x0, const int xmax, const int k0, const int kmax)
-{
-    TransposeInterleaveCommon<16, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
+    uint16_t* out, const uint16_t* const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax
+) {
+  TransposeInterleaveCommon<16, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
 }
 
 #endif // __arm__
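
The redirections at the top of this file rely on the transform being a pure byte copy: a transposed 8-wide block of 32-bit elements occupies exactly the same bytes as a 16-wide block of 16-bit elements, so the 4-byte specialisation can reuse the 2-byte one by doubling the row stride and the X range (stride*2, x0*2, xmax*2) while leaving k0/kmax untouched. Worked numbers, for illustration only:

#include <cstdint>

// A 32-bit row stride of 100 elements covers the same bytes as a 16-bit stride
// of 200, and 32-bit columns [3, 11) become 16-bit columns [6, 22).
static_assert(100 * sizeof(uint32_t) == 200 * sizeof(uint16_t), "same bytes per row");
static_assert(2 * 3 == 6 && 2 * 11 == 22, "column indices simply double");
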
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp
index 8d61f15..7e61f42 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp
@@ -30,17 +30,15 @@
 #include "../asmlib.hpp"
 #include "../utils.hpp"
 
-template <>
-template <typename T>
-void TransformImpl<4, 16, false, 1, 1>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
-{
-    uint8_t       *outptr = (uint8_t *)out;
-    const uint8_t *inptr  = (uint8_t *)in;
+template<>
+template<typename T>
+void TransformImpl<4, 16, false, 1, 1>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
+    uint8_t *outptr = (uint8_t *)out;
+    const uint8_t *inptr = (uint8_t *)in;
 
     uint8_t zerobuff[16];
 
-    for(int y = y0; y < ymax; y += 4)
-    {
+    for (int y=y0; y<ymax; y+=4) {
         const uint8_t *inptr0 = inptr + y * ldin + k0;
         const uint8_t *inptr1 = inptr0 + ldin;
         const uint8_t *inptr2 = inptr1 + ldin;
@@ -51,14 +49,11 @@
         prefetch_2x(inptr2);
         prefetch_2x(inptr3);
 
-        int x = (kmax - k0);
-        for(; x > 15; x -= 16)
-        {
+        int x=(kmax-k0);
+        for (;x>15;x-=16) {
             /* Cope with ragged cases by copying from a buffer of zeroes instead */
-            if((y + 3) >= ymax)
-            {
-                switch((y + 3) - ymax)
-                {
+            if ((y + 3) >= ymax) {
+                switch ((y + 3) - ymax) {
                     /* Everything falls through in here */
                     case 2:
                         inptr1 = zerobuff;
@@ -73,23 +68,28 @@
                 }
             }
 
-            __asm __volatile(
-                "LDR    q0, [%[inptr0]], #16\n" ASM_PREFETCH("[%[inptr0], #176]") "LDR    q1, [%[inptr1]], #16\n" ASM_PREFETCH("[%[inptr1], #176]")
-                "STP    q0, q1, [%[outptr]], #32\n"
-                "LDR    q0, [%[inptr2]], #16\n" ASM_PREFETCH("[%[inptr2], #176]") "LDR    q1, [%[inptr3]], #16\n" ASM_PREFETCH("[%[inptr3], #176]") "STP    q0, q1, [%[outptr]], #32\n"
-                : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
-                [outptr] "+r"(outptr)
+            __asm __volatile (
+                "LDR	q0, [%[inptr0]], #16\n"
+                ASM_PREFETCH("[%[inptr0], #176]")
+                "LDR	q1, [%[inptr1]], #16\n"
+                ASM_PREFETCH("[%[inptr1], #176]")
+                "STP	q0, q1, [%[outptr]], #32\n"
+                "LDR	q0, [%[inptr2]], #16\n"
+                ASM_PREFETCH("[%[inptr2], #176]")
+                "LDR	q1, [%[inptr3]], #16\n"
+                ASM_PREFETCH("[%[inptr3], #176]")
+                "STP	q0, q1, [%[outptr]], #32\n"
+                : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
+                  [outptr] "+r" (outptr)
                 :
-                : "v0", "v1");
+                : "v0", "v1"
+            );
         }
 
-        if(x > 0)
-        {
+        if (x>0) {
             /* Need to duplicate this here, in case we didn't run the main loop. */
-            if((y + 3) >= ymax)
-            {
-                switch((y + 3) - ymax)
-                {
+            if ((y + 3) >= ymax) {
+                switch ((y + 3) - ymax) {
                     /* Everything falls through in here */
                     case 2:
                         inptr1 = zerobuff;
@@ -105,16 +105,11 @@
             }
 
             /* We have to write out 16 values, copy as many legal values as there are and pad with 0 */
-            auto f = [&outptr, x](const uint8_t *&p)
-            {
-                for(int i = 0; i < 16; i++)
-                {
-                    if(i < x)
-                    {
+            auto f = [&outptr, x](const uint8_t *&p) {
+                for (int i=0; i<16; i++) {
+                    if (i < x) {
                         *outptr++ = *p++;
-                    }
-                    else
-                    {
+                    } else {
                         *outptr++ = 0;
                     }
                 }
@@ -128,4 +123,4 @@
     }
 }
 
-#endif // __aarch64__
\ No newline at end of file
+#endif  // __aarch64__
\ No newline at end of file
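
To make the tail handling above concrete, with illustrative numbers: if x = 5 columns remain, the lambda still emits a full 16-byte group for each source row, i.e. the five remaining bytes of that row followed by eleven zeros, so the packed 4x16 block shape the kernel expects is preserved. A scalar sketch of that padding step:

#include <cstdint>

// Always emit 16 bytes per row: real data first, zero padding after,
// mirroring the lambda in the tail of the transform above.
static void pad_row_to_16(uint8_t *&outptr, const uint8_t *&p, int x) {
    for (int i = 0; i < 16; i++) {
        *outptr++ = (i < x) ? *p++ : 0;
    }
}
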
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp
index 3cbc881..99bb2d6 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp
@@ -29,17 +29,15 @@
 
 #include "../asmlib.hpp"
 
-template <>
-template <typename T>
-void TransformImpl<8, 1, false, 2, 2>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
-{
-    uint16_t       *outptr = (uint16_t *)out;
-    const uint16_t *inptr  = (const uint16_t *)in;
+template<>
+template<typename T>
+void TransformImpl<8, 1, false, 2, 2>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
+    uint16_t *outptr = (uint16_t *)out;
+    const uint16_t *inptr = (const uint16_t *)in;
 
     uint16_t zerobuff[24];
 
-    for(int y = y0; y < ymax; y += 8)
-    {
+    for (int y=y0; y<ymax; y+=8) {
         const uint16_t *inptr0 = inptr + y * ldin + k0;
         const uint16_t *inptr1 = inptr0 + ldin;
         const uint16_t *inptr2 = inptr1 + ldin;
@@ -58,14 +56,11 @@
         prefetch_2x(inptr6);
         prefetch_2x(inptr7);
 
-        int x = (kmax - k0);
-        for(; x > 7; x -= 8)
-        {
+        int x=(kmax-k0);
+        for (;x>7;x-=8) {
             /* Cope with ragged cases by copying from a buffer of zeroes instead */
-            if((y + 7) >= ymax)
-            {
-                switch((y + 7) - ymax)
-                {
+            if ((y + 7) >= ymax) {
+                switch ((y + 7) - ymax) {
                     /* Everything falls through in here */
                     case 6:
                         inptr1 = zerobuff;
@@ -89,72 +84,74 @@
             }
 
             int skippf = (x & 31);
-            __asm __volatile(
+            __asm __volatile (
                 // Load up 8 elements (1 vector) from each of 8 sources.
-                "CBNZ    %w[skippf], 1f\n" ASM_PREFETCH("[%[inptr0], #128]")
+                "CBNZ	%w[skippf], 1f\n"
+                ASM_PREFETCH("[%[inptr0], #128]")
                 ASM_PREFETCH("[%[inptr1], #128]")
                 ASM_PREFETCH("[%[inptr2], #128]")
                 ASM_PREFETCH("[%[inptr3], #128]")
                 "1:\n"
 
-                "LDR    q0, [%[inptr0]], #16\n" // q0=A0A1A2A3A4A5A6A7
-                "LDR    q4, [%[inptr4]], #16\n" // q8=E0E1E2E3E4E5E6E7
-                "LDR    q2, [%[inptr2]], #16\n" // q4=C0C1C2C3...
-                "LDR    q6, [%[inptr6]], #16\n"
-                "ZIP1    v8.8h, v0.8h, v4.8h\n"  // q8=A0E0A1E1A2E2A3E3
-                "ZIP2    v16.8h, v0.8h, v4.8h\n" // q16=A4E4A5E5A6E6A7E7
-                "ZIP1    v9.8h, v2.8h, v6.8h\n"  // q9=C0G0C1G1C2G2C3G3
-                "ZIP2    v17.8h, v2.8h, v6.8h\n" // q17=C4G4C5G5C6G6C7G7
-                "LDR    q1, [%[inptr1]], #16\n"  // q1=B0B1B2B3B4B5B6B7
-                "LDR    q5, [%[inptr5]], #16\n"
-                "LDR    q3, [%[inptr3]], #16\n" // q3=D0D1D2D3....
-                "LDR    q7, [%[inptr7]], #16\n"
-                "ZIP1    v10.8h, v1.8h, v5.8h\n" // q18=B0F0B1F1B2F2B3F3
-                "ZIP2    v18.8h, v1.8h, v5.8h\n" // q18=B4F4B5F5B6F6B7F7
-                "ZIP1    v11.8h, v3.8h, v7.8h\n" // q19=D0H0D1H1D2H2D3H3
-                "ZIP2    v19.8h, v3.8h, v7.8h\n" // q19=D4H4D5H5D6H6D7H7
+                "LDR	q0, [%[inptr0]], #16\n" // q0=A0A1A2A3A4A5A6A7
+                "LDR	q4, [%[inptr4]], #16\n" // q8=E0E1E2E3E4E5E6E7
+                "LDR	q2, [%[inptr2]], #16\n" // q4=C0C1C2C3...
+                "LDR	q6, [%[inptr6]], #16\n"
+                "ZIP1	v8.8h, v0.8h, v4.8h\n"  // q8=A0E0A1E1A2E2A3E3
+                "ZIP2	v16.8h, v0.8h, v4.8h\n" // q16=A4E4A5E5A6E6A7E7
+                "ZIP1	v9.8h, v2.8h, v6.8h\n"  // q9=C0G0C1G1C2G2C3G3
+                "ZIP2	v17.8h, v2.8h, v6.8h\n" // q17=C4G4C5G5C6G6C7G7
+                "LDR	q1, [%[inptr1]], #16\n" // q1=B0B1B2B3B4B5B6B7
+                "LDR	q5, [%[inptr5]], #16\n"
+                "LDR	q3, [%[inptr3]], #16\n" // q3=D0D1D2D3....
+                "LDR	q7, [%[inptr7]], #16\n"
+                "ZIP1	v10.8h, v1.8h, v5.8h\n" // q18=B0F0B1F1B2F2B3F3
+                "ZIP2	v18.8h, v1.8h, v5.8h\n" // q18=B4F4B5F5B6F6B7F7
+                "ZIP1	v11.8h, v3.8h, v7.8h\n" // q19=D0H0D1H1D2H2D3H3
+                "ZIP2	v19.8h, v3.8h, v7.8h\n" // q19=D4H4D5H5D6H6D7H7
 
-                "ZIP1    v12.8h,  v8.8h,  v9.8h\n" // q20=A0C0E0G0A1C1E1G1
-                "ZIP2    v20.8h,  v8.8h,  v9.8h\n"
-                "ZIP1    v13.8h, v10.8h, v11.8h\n" // q21=B0D0F0H0B1I1F1H1
-                "ZIP2    v21.8h, v10.8h, v11.8h\n"
+                "ZIP1	v12.8h,  v8.8h,  v9.8h\n" // q20=A0C0E0G0A1C1E1G1
+                "ZIP2	v20.8h,  v8.8h,  v9.8h\n"
+                "ZIP1	v13.8h, v10.8h, v11.8h\n" // q21=B0D0F0H0B1I1F1H1
+                "ZIP2	v21.8h, v10.8h, v11.8h\n"
 
-                "CBNZ    %w[skippf], 2f\n" ASM_PREFETCH("[%[inptr4], #112]")
+                "CBNZ	%w[skippf], 2f\n"
+                ASM_PREFETCH("[%[inptr4], #112]")
                 ASM_PREFETCH("[%[inptr5], #112]")
                 ASM_PREFETCH("[%[inptr6], #112]")
                 ASM_PREFETCH("[%[inptr7], #112]")
                 "2:\n"
 
-                "ZIP1    v22.8h, v16.8h, v17.8h\n"
-                "ZIP2    v30.8h, v16.8h, v17.8h\n"
-                "ZIP1    v23.8h, v18.8h, v19.8h\n"
-                "ZIP2    v31.8h, v18.8h, v19.8h\n"
+                "ZIP1	v22.8h, v16.8h, v17.8h\n"
+                "ZIP2	v30.8h, v16.8h, v17.8h\n"
+                "ZIP1	v23.8h, v18.8h, v19.8h\n"
+                "ZIP2	v31.8h, v18.8h, v19.8h\n"
 
-                "ZIP1    v14.8h, v12.8h, v13.8h\n"    // q22=A0B0C0D0E0F0G0H0
-                "ZIP2    v15.8h, v12.8h, v13.8h\n"    // q23=A1B1C1D1E1F1G1H1
-                "STP    q14, q15, [%[outptr]], #32\n" // Write back first two elements
+                "ZIP1	v14.8h, v12.8h, v13.8h\n" // q22=A0B0C0D0E0F0G0H0
+                "ZIP2	v15.8h, v12.8h, v13.8h\n" // q23=A1B1C1D1E1F1G1H1
+                "STP	q14, q15, [%[outptr]], #32\n" // Write back first two elements
 
-                "ZIP1    v0.8h, v20.8h, v21.8h\n"
-                "ZIP2    v1.8h, v20.8h, v21.8h\n"
-                "STP    q0, q1, [%[outptr]], #32\n" // Write back next two elements
+                "ZIP1	v0.8h, v20.8h, v21.8h\n"
+                "ZIP2	v1.8h, v20.8h, v21.8h\n"
+                "STP	q0, q1, [%[outptr]], #32\n" // Write back next two elements
 
-                "ZIP1    v2.8h, v22.8h, v23.8h\n"
-                "ZIP2    v3.8h, v22.8h, v23.8h\n"
-                "STP    q2, q3, [%[outptr]], #32\n" // Write back next two elements
+                "ZIP1	v2.8h, v22.8h, v23.8h\n"
+                "ZIP2	v3.8h, v22.8h, v23.8h\n"
+                "STP	q2, q3, [%[outptr]], #32\n" // Write back next two elements
 
-                "ZIP1    v4.8h, v30.8h, v31.8h\n"
-                "ZIP2    v5.8h, v30.8h, v31.8h\n"
-                "STP    q4, q5, [%[outptr]], #32\n" // Write back last two elements
-                : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
-                [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
-                : [skippf] "r"(skippf)
+                "ZIP1	v4.8h, v30.8h, v31.8h\n"
+                "ZIP2	v5.8h, v30.8h, v31.8h\n"
+                "STP	q4, q5, [%[outptr]], #32\n" // Write back last two elements
+                : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
+                  [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
+                : [skippf] "r" (skippf)
                 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
-                "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
-                "v25", "v26", "v27", "v28", "v29", "v30", "v31");
+                  "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
+                  "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+            );
         }
 
-        for(; x > 0; x--)
-        {
+        for (;x>0;x--) {
             *outptr++ = *inptr0++;
             *outptr++ = *inptr1++;
             *outptr++ = *inptr2++;
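
For reference, the 8-way interleave reverted above rearranges 8 rows of 16-bit data so that element k of rows y..y+7 ends up in 8 consecutive output positions. A minimal scalar sketch (a hypothetical helper, ignoring the y0/k0 windowing of the real kernel and simply zero-filling rows past the end, mirroring the "buffer of zeroes" comment):

    #include <cstdint>
    #include <vector>

    // Scalar model of the 8-way interleave: output is written column by column,
    // eight row-values at a time; missing rows in the last block are padded.
    static std::vector<uint16_t> interleave8_ref(const std::vector<uint16_t> &in,
                                                 int height, int width) {
        std::vector<uint16_t> out;
        for (int y = 0; y < height; y += 8) {      // one 8-row block at a time
            for (int k = 0; k < width; k++) {      // walk along the K dimension
                for (int r = 0; r < 8; r++) {      // one value from each row
                    int row = y + r;
                    out.push_back(row < height ? in[row * width + k] : uint16_t(0));
                }
            }
        }
        return out;
    }
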
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
index 47e4fa2..83391cc 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
@@ -29,17 +29,15 @@
 
 #include "../asmlib.hpp"
 
-template <>
-template <typename T>
-inline void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
-{
-    uint32_t       *outptr = (uint32_t *)out;
-    const uint32_t *inptr  = (uint32_t *)in;
+template<>
+template<typename T>
+inline void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
+    uint32_t *outptr = (uint32_t *)out;
+    const uint32_t *inptr = (uint32_t *)in;
 
     uint32_t zerobuff[8];
 
-    for(int y = y0; y < ymax; y += 8)
-    {
+    for (int y=y0; y<ymax; y+=8) {
         const uint32_t *inptr0 = inptr + y * ldin + k0;
         const uint32_t *inptr1 = inptr0 + ldin;
         const uint32_t *inptr2 = inptr1 + ldin;
@@ -58,14 +56,11 @@
         prefetch_2x(inptr6);
         prefetch_2x(inptr7);
 
-        int x = (kmax - k0);
-        for(; x > 7; x -= 8)
-        {
+        int x=(kmax-k0);
+        for (;x>7;x-=8) {
             /* Cope with ragged cases by copying from a buffer of zeroes instead */
-            if((y + 7) >= ymax)
-            {
-                switch((y + 7) - ymax)
-                {
+            if ((y + 7) >= ymax) {
+                switch ((y + 7) - ymax) {
                     /* Everything falls through in here */
                     case 6:
                         inptr1 = zerobuff;
@@ -88,19 +83,20 @@
                 }
             }
 
-            __asm __volatile(
+            __asm __volatile (
                 // Load up 8 elements (2 vectors) from each of 8 sources.
                 "LDP        q0, q1, [%[inptr0]], #32\n" // q0=A0A1A2A3
                 "LDP        q2, q3, [%[inptr1]], #32\n" // q2=B0B1B2B3
                 "LDP        q4, q5, [%[inptr2]], #32\n" // q4=C0C1C2C3
-                "ZIP1       v16.4s, v0.4s, v4.4s\n"     // q16=A0C0A1C1
+                "ZIP1       v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1
                 ASM_PREFETCH("[%[inptr0], #128]")
                 "LDP        q6, q7, [%[inptr3]], #32\n" // q6=D0D1D2D3
-                "ZIP1       v17.4s, v2.4s, v6.4s\n"     // q17=B0D0B1D1
+                "ZIP1       v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1
                 "LDP        q8, q9, [%[inptr4]], #32\n"
                 "LDP        q10, q11, [%[inptr5]], #32\n"
                 "LDP        q12, q13, [%[inptr6]], #32\n"
-                "ZIP1       v18.4s, v8.4s, v12.4s\n" ASM_PREFETCH("[%[inptr1], #128]")
+                "ZIP1       v18.4s, v8.4s, v12.4s\n"
+                ASM_PREFETCH("[%[inptr1], #128]")
                 "LDP        q14, q15, [%[inptr7]], #32\n"
                 "ZIP1       v19.4s, v10.4s, v14.4s\n"
 
@@ -110,7 +106,8 @@
                 "ZIP2       v22.4s, v16.4s, v17.4s\n"
                 "ZIP2       v23.4s, v18.4s, v19.4s\n"
 
-                "ZIP2       v16.4s, v0.4s, v4.4s\n" ASM_PREFETCH("[%[inptr3], #128]")
+                "ZIP2       v16.4s, v0.4s, v4.4s\n"
+                ASM_PREFETCH("[%[inptr3], #128]")
                 "ZIP2       v17.4s, v2.4s, v6.4s\n"
                 "STP        q20, q21, [%[outptr]], #32\n" // Write back the first element of each source
 
@@ -118,12 +115,14 @@
                 "ZIP2       v19.4s, v10.4s, v14.4s\n"
                 "STP        q22, q23, [%[outptr]], #32\n" // Write back the second element of each source
 
-                "ZIP1       v20.4s, v16.4s, v17.4s\n" ASM_PREFETCH("[%[inptr4], #128]")
+                "ZIP1       v20.4s, v16.4s, v17.4s\n"
+                ASM_PREFETCH("[%[inptr4], #128]")
                 "ZIP1       v21.4s, v18.4s, v19.4s\n"
                 "ZIP2       v22.4s, v16.4s, v17.4s\n"
                 "ZIP2       v23.4s, v18.4s, v19.4s\n"
 
-                "ZIP1       v16.4s, v1.4s, v5.4s\n" ASM_PREFETCH("[%[inptr5], #128]")
+                "ZIP1       v16.4s, v1.4s, v5.4s\n"
+                ASM_PREFETCH("[%[inptr5], #128]")
                 "ZIP1       v17.4s, v3.4s, v7.4s\n"
                 "STP        q20, q21, [%[outptr]], #32\n" // Third element
 
@@ -133,14 +132,16 @@
 
                 "ZIP1       v20.4s, v16.4s, v17.4s\n"
                 "ZIP1       v21.4s, v18.4s, v19.4s\n"
-                "ZIP2       v22.4s, v16.4s, v17.4s\n" ASM_PREFETCH("[%[inptr6], #128]")
+                "ZIP2       v22.4s, v16.4s, v17.4s\n"
+                ASM_PREFETCH("[%[inptr6], #128]")
                 "ZIP2       v23.4s, v18.4s, v19.4s\n"
 
                 "ZIP2       v16.4s, v1.4s, v5.4s\n"
                 "ZIP2       v17.4s, v3.4s, v7.4s\n"
                 "STP        q20, q21, [%[outptr]], #32\n" // Fifth element
 
-                "ZIP2       v18.4s, v9.4s, v13.4s\n" ASM_PREFETCH("[%[inptr7], #128]")
+                "ZIP2       v18.4s, v9.4s, v13.4s\n"
+                ASM_PREFETCH("[%[inptr7], #128]")
                 "ZIP2       v19.4s, v11.4s, v15.4s\n"
                 "STP        q22, q23, [%[outptr]], #32\n" // Sixth element
 
@@ -151,15 +152,15 @@
                 "ZIP2       v22.4s, v16.4s, v17.4s\n"
                 "ZIP2       v23.4s, v18.4s, v19.4s\n"
                 "STP        q22, q23, [%[outptr]], #32\n" // Eighth element
-                : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
-                [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
+                : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
+                  [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
                 :
                 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
-                "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
+                  "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
+            );
         }
 
-        for(; x > 0; x--)
-        {
+        for (;x>0;x--) {
             *outptr++ = *inptr0++;
             *outptr++ = *inptr1++;
             *outptr++ = *inptr2++;
@@ -172,4 +173,4 @@
     }
 }
 
-#endif // __aarch64__
+#endif  // __aarch64__
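
The ragged-case handling in these interleave kernels (the fall-through switch over (y + 7) - ymax) simply redirects the surplus row pointers at a spare buffer, so the unrolled inner loop reads eight sources unconditionally. A hedged sketch of the same idea, using a hypothetical helper name:

    #include <cstdint>

    // Point the rows that do not exist in this 8-row block at a dummy buffer;
    // the unrolled loop then never has to branch per element.
    static void substitute_missing_rows(const uint16_t *rows[8], int y, int ymax,
                                        const uint16_t *dummy) {
        for (int r = 0; r < 8; r++) {
            if (y + r >= ymax) {
                rows[r] = dummy;
            }
        }
    }
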
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp
index 1d2d496..fd81216 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp
@@ -29,17 +29,15 @@
 
 #include "../asmlib.hpp"
 
-template <>
-template <>
-inline void TransformImpl<8, 1, false, 4, 2>::Transform(float *out, const __fp16 *in, int ldin, int y0, int ymax, int k0, int kmax)
-{
-    float        *outptr = out;
-    const __fp16 *inptr  = in;
+template<>
+template<>
+inline void TransformImpl<8, 1, false, 4, 2>::Transform(float *out, const __fp16 *in, int ldin, int y0, int ymax, int k0, int kmax) {
+    float *outptr = out;
+    const __fp16 *inptr = in;
 
     __fp16 zerobuff[8];
 
-    for(int y = y0; y < ymax; y += 8)
-    {
+    for (int y=y0; y<ymax; y+=8) {
         const __fp16 *inptr0 = inptr + y * ldin + k0;
         const __fp16 *inptr1 = inptr0 + ldin;
         const __fp16 *inptr2 = inptr1 + ldin;
@@ -58,14 +56,11 @@
         prefetch_2x(inptr6);
         prefetch_2x(inptr7);
 
-        int x = (kmax - k0);
-        for(; x > 7; x -= 8)
-        {
+        int x=(kmax-k0);
+        for (;x>7;x-=8) {
             /* Cope with ragged cases by copying from a buffer of zeroes instead */
-            if((y + 7) >= ymax)
-            {
-                switch((y + 7) - ymax)
-                {
+            if ((y + 7) >= ymax) {
+                switch ((y + 7) - ymax) {
                     /* Everything falls through in here */
                     case 6:
                         inptr1 = zerobuff;
@@ -88,95 +83,100 @@
                 }
             }
 
-            __asm __volatile(
+            __asm __volatile (
                 // Load up 8 elements (2 vectors) from each of 8 sources.
-                "LDR    q0, [%[inptr0]], #16\n"
-                "LDR    q2, [%[inptr1]], #16\n"
-                "FCVTL2    v1.4s, v0.8h\n"
-                "FCVTL    v0.4s, v0.4h\n"
-                "LDR    q4, [%[inptr2]], #16\n" // q4=C0C1C2C3
-                "FCVTL2    v3.4s, v2.8h\n"
-                "FCVTL    v2.4s, v2.4h\n"
-                "FCVTL2    v5.4s, v4.8h\n"
-                "FCVTL    v4.4s, v4.4h\n"
-                "ZIP1    v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1
+                "LDR	q0, [%[inptr0]], #16\n"
+                "LDR	q2, [%[inptr1]], #16\n"
+                "FCVTL2	v1.4s, v0.8h\n"
+                "FCVTL	v0.4s, v0.4h\n"
+                "LDR	q4, [%[inptr2]], #16\n" // q4=C0C1C2C3
+                "FCVTL2	v3.4s, v2.8h\n"
+                "FCVTL	v2.4s, v2.4h\n"
+                "FCVTL2	v5.4s, v4.8h\n"
+                "FCVTL	v4.4s, v4.4h\n"
+                "ZIP1	v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1
                 ASM_PREFETCH("[%[inptr0], #128]")
-                "LDR    q6, [%[inptr3]], #16\n" // q6=D0D1D2D3
-                "FCVTL2    v7.4s, v6.8h\n"
-                "FCVTL    v6.4s, v6.4h\n"
-                "ZIP1    v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1
-                "LDR    q8, [%[inptr4]], #16\n"
-                "LDR    q10, [%[inptr5]], #16\n"
-                "FCVTL2    v9.4s, v8.8h\n"
-                "FCVTL    v8.4s, v8.4h\n" ASM_PREFETCH("[%[inptr1], #128]")
-                "LDR    q12, [%[inptr6]], #16\n"
-                "FCVTL2    v11.4s, v10.8h\n"
-                "FCVTL    v10.4s, v10.4h\n"
-                "FCVTL2    v13.4s, v12.8h\n"
-                "FCVTL    v12.4s, v12.4h\n"
-                "ZIP1    v18.4s, v8.4s, v12.4s\n"
-                "LDR    q14, [%[inptr7]], #16\n"
-                "FCVTL2    v15.4s, v14.8h\n"
-                "FCVTL    v14.4s, v14.4h\n"
-                "ZIP1    v19.4s, v10.4s, v14.4s\n"
+                "LDR	q6, [%[inptr3]], #16\n" // q6=D0D1D2D3
+                "FCVTL2	v7.4s, v6.8h\n"
+                "FCVTL	v6.4s, v6.4h\n"
+                "ZIP1	v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1
+                "LDR	q8, [%[inptr4]], #16\n"
+                "LDR	q10, [%[inptr5]], #16\n"
+                "FCVTL2	v9.4s, v8.8h\n"
+                "FCVTL	v8.4s, v8.4h\n"
+                ASM_PREFETCH("[%[inptr1], #128]")
+                "LDR	q12, [%[inptr6]], #16\n"
+                "FCVTL2	v11.4s, v10.8h\n"
+                "FCVTL	v10.4s, v10.4h\n"
+                "FCVTL2	v13.4s, v12.8h\n"
+                "FCVTL	v12.4s, v12.4h\n"
+                "ZIP1	v18.4s, v8.4s, v12.4s\n"
+                "LDR	q14, [%[inptr7]], #16\n"
+                "FCVTL2	v15.4s, v14.8h\n"
+                "FCVTL	v14.4s, v14.4h\n"
+                "ZIP1	v19.4s, v10.4s, v14.4s\n"
 
                 ASM_PREFETCH("[%[inptr2], #128]")
-                "ZIP1    v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0
-                "ZIP1    v21.4s, v18.4s, v19.4s\n"
-                "ZIP2    v22.4s, v16.4s, v17.4s\n"
-                "ZIP2    v23.4s, v18.4s, v19.4s\n" ASM_PREFETCH("[%[inptr3], #128]")
+                "ZIP1	v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0
+                "ZIP1	v21.4s, v18.4s, v19.4s\n"
+                "ZIP2	v22.4s, v16.4s, v17.4s\n"
+                "ZIP2	v23.4s, v18.4s, v19.4s\n"
+                ASM_PREFETCH("[%[inptr3], #128]")
 
-                "ZIP2    v16.4s, v0.4s, v4.4s\n"
-                "ZIP2    v17.4s, v2.4s, v6.4s\n"
-                "STP    q20, q21, [%[outptr]], #32\n" // Write back the first element of each source
+                "ZIP2	v16.4s, v0.4s, v4.4s\n"
+                "ZIP2	v17.4s, v2.4s, v6.4s\n"
+                "STP	q20, q21, [%[outptr]], #32\n" // Write back the first element of each source
 
-                "ZIP2    v18.4s, v8.4s, v12.4s\n" ASM_PREFETCH("[%[inptr4], #128]")
-                "ZIP2    v19.4s, v10.4s, v14.4s\n"
-                "STP    q22, q23, [%[outptr]], #32\n" // Write back the second element of each source
+                "ZIP2	v18.4s, v8.4s, v12.4s\n"
+                ASM_PREFETCH("[%[inptr4], #128]")
+                "ZIP2	v19.4s, v10.4s, v14.4s\n"
+                "STP	q22, q23, [%[outptr]], #32\n" // Write back the second element of each source
 
-                "ZIP1    v20.4s, v16.4s, v17.4s\n"
-                "ZIP1    v21.4s, v18.4s, v19.4s\n" ASM_PREFETCH("[%[inptr5], #128]")
-                "ZIP2    v22.4s, v16.4s, v17.4s\n"
-                "ZIP2    v23.4s, v18.4s, v19.4s\n"
+                "ZIP1	v20.4s, v16.4s, v17.4s\n"
+                "ZIP1	v21.4s, v18.4s, v19.4s\n"
+                ASM_PREFETCH("[%[inptr5], #128]")
+                "ZIP2	v22.4s, v16.4s, v17.4s\n"
+                "ZIP2	v23.4s, v18.4s, v19.4s\n"
 
-                "ZIP1    v16.4s, v1.4s, v5.4s\n"
-                "ZIP1    v17.4s, v3.4s, v7.4s\n" ASM_PREFETCH("[%[inptr6], #128]")
-                "STP    q20, q21, [%[outptr]], #32\n" // Third element
+                "ZIP1	v16.4s, v1.4s, v5.4s\n"
+                "ZIP1	v17.4s, v3.4s, v7.4s\n"
+                ASM_PREFETCH("[%[inptr6], #128]")
+                "STP	q20, q21, [%[outptr]], #32\n" // Third element
 
-                "ZIP1    v18.4s, v9.4s, v13.4s\n"
-                "ZIP1    v19.4s, v11.4s, v15.4s\n"
-                "STP    q22, q23, [%[outptr]], #32\n" // Fourth element
+                "ZIP1	v18.4s, v9.4s, v13.4s\n"
+                "ZIP1	v19.4s, v11.4s, v15.4s\n"
+                "STP	q22, q23, [%[outptr]], #32\n" // Fourth element
                 ASM_PREFETCH("[%[inptr7], #128]")
 
-                "ZIP1    v20.4s, v16.4s, v17.4s\n"
-                "ZIP1    v21.4s, v18.4s, v19.4s\n"
-                "ZIP2    v22.4s, v16.4s, v17.4s\n"
-                "ZIP2    v23.4s, v18.4s, v19.4s\n"
+                "ZIP1	v20.4s, v16.4s, v17.4s\n"
+                "ZIP1	v21.4s, v18.4s, v19.4s\n"
+                "ZIP2	v22.4s, v16.4s, v17.4s\n"
+                "ZIP2	v23.4s, v18.4s, v19.4s\n"
 
-                "ZIP2    v16.4s, v1.4s, v5.4s\n"
-                "ZIP2    v17.4s, v3.4s, v7.4s\n"
-                "STP    q20, q21, [%[outptr]], #32\n" // Fifth element
+                "ZIP2	v16.4s, v1.4s, v5.4s\n"
+                "ZIP2	v17.4s, v3.4s, v7.4s\n"
+                "STP	q20, q21, [%[outptr]], #32\n" // Fifth element
 
-                "ZIP2    v18.4s, v9.4s, v13.4s\n"
-                "ZIP2    v19.4s, v11.4s, v15.4s\n"
-                "STP    q22, q23, [%[outptr]], #32\n" // Sixth element
+                "ZIP2	v18.4s, v9.4s, v13.4s\n"
+                "ZIP2	v19.4s, v11.4s, v15.4s\n"
+                "STP	q22, q23, [%[outptr]], #32\n" // Sixth element
 
-                "ZIP1    v20.4s, v16.4s, v17.4s\n"
-                "ZIP1    v21.4s, v18.4s, v19.4s\n"
-                "STP    q20, q21, [%[outptr]], #32\n" // Seventh element
+                "ZIP1	v20.4s, v16.4s, v17.4s\n"
+                "ZIP1	v21.4s, v18.4s, v19.4s\n"
+                "STP	q20, q21, [%[outptr]], #32\n" // Seventh element
 
-                "ZIP2    v22.4s, v16.4s, v17.4s\n"
-                "ZIP2    v23.4s, v18.4s, v19.4s\n"
-                "STP    q22, q23, [%[outptr]], #32\n" // Eighth element
-                : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
-                [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
+                "ZIP2	v22.4s, v16.4s, v17.4s\n"
+                "ZIP2	v23.4s, v18.4s, v19.4s\n"
+                "STP	q22, q23, [%[outptr]], #32\n" // Eighth element
+                : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
+                  [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
                 :
                 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
-                "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
+                  "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
+            );
         }
 
-        for(; x > 0; x--)
-        {
+        for (;x>0;x--) {
             *outptr++ = *inptr0++;
             *outptr++ = *inptr1++;
             *outptr++ = *inptr2++;
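
The half-to-float variant above uses the FCVTL/FCVTL2 pair to widen the low and high four fp16 lanes of each vector to fp32 before the same ZIP-based interleave. In scalar terms this is just a per-element cast, e.g. (an illustrative sketch only, assuming a toolchain with __fp16 support, which this file already requires):

    // FCVTL widens lanes 0..3 of a 128-bit register, FCVTL2 widens lanes 4..7.
    static void widen8_fp16_to_fp32(const __fp16 *src, float *dst) {
        for (int i = 0; i < 8; i++) {
            dst[i] = static_cast<float>(src[i]);
        }
    }
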
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp
index fd6a253..6e07064 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp
@@ -31,105 +31,115 @@
 template <>
 template <typename T>
 inline void TransformImpl<6, 1, true, 4, 4>::Transform(
-    T *out, const T *const in, const int stride,
-    const int x0, const int xmax, const int k0, const int kmax)
-{
-    // Redirect to a 12 x uint16_t specialisation
-    TransformImpl<12, 1, true, 2, 2>::Transform(
-        reinterpret_cast<uint16_t *>(out),
-        reinterpret_cast<const uint16_t *const>(in),
-        stride * 2, x0 * 2, xmax * 2, k0, kmax);
+    T* out, const T* const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax
+) {
+  // Redirect to a 12 x uint16_t specialisation
+  TransformImpl<12, 1, true, 2, 2>::Transform(
+    reinterpret_cast<uint16_t *>(out),
+    reinterpret_cast<const uint16_t * const>(in),
+    stride*2, x0*2, xmax*2, k0, kmax
+  );
 }
 
 // Generic 12x16-bit sized specialisation
 template <>
 template <typename T>
 inline void TransformImpl<12, 1, true, 2, 2>::Transform(
-    T *out, const T *const in, const int stride,
-    const int x0, const int xmax, const int k0, const int kmax)
-{
-    // Redirect to a uint16_t specialisation
-    Transform(
-        reinterpret_cast<uint16_t *>(out),
-        reinterpret_cast<const uint16_t *const>(in),
-        stride, x0, xmax, k0, kmax);
+    T* out, const T* const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax
+) {
+  // Redirect to a uint16_t specialisation
+  Transform(
+    reinterpret_cast<uint16_t *>(out),
+    reinterpret_cast<const uint16_t * const>(in),
+    stride, x0, xmax, k0, kmax
+  );
 }
 
 // Specialised 12 x uint16_t version
 template <>
-inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out)
-{
-    __asm volatile(
-        "LDR q0, [%[in0]]\n"
-        "STR q0, [%[out]]\n"
-        "LDR d1, [%[in0], #0x10]\n"
-        "STR d1, [%[out], #0x10]\n"
-        "ADD %x[in0], %x[in0], #0x18\n" ASM_PREFETCH("[%[in0], #192]")
-        : [in0] "+r"(in0),
-        [out] "+r"(out)
-        :
-        : "v0", "v1", "memory");
+inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) {
+  __asm volatile (
+    "LDR q0, [%[in0]]\n"
+    "STR q0, [%[out]]\n"
+    "LDR d1, [%[in0], #0x10]\n"
+    "STR d1, [%[out], #0x10]\n"
+    "ADD %x[in0], %x[in0], #0x18\n"
+    ASM_PREFETCH("[%[in0], #192]")
+    : [in0] "+r" (in0),
+      [out] "+r" (out)
+    :
+    : "v0", "v1", "memory"
+  );
 }
 
 template <>
-inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out)
-{
-    __asm volatile(
-        "LDR q0, [%[in0]]\n"
-        "LDR d1, [%[in0], #0x10]\n"
-        "ADD %x[in0], %x[in0], #0x18\n" ASM_PREFETCH("[%[in0], #192]")
+inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out) {
+  __asm volatile (
+    "LDR q0, [%[in0]]\n"
+    "LDR d1, [%[in0], #0x10]\n"
+    "ADD %x[in0], %x[in0], #0x18\n"
+    ASM_PREFETCH("[%[in0], #192]")
 
-        "LDR x21, [%[in1]]\n"
-        "LDR q2, [%[in1], #0x08]\n"
-        "INS v1.d[1], x21\n"
-        "ADD %x[in1], %x[in1], #0x18\n"
-        "STP q0, q1, [%[out]]\n"
-        "STR q2, [%x[out], #0x20]\n" ASM_PREFETCH("[%[in1], #192]")
-        : [in0] "+r"(in0),
-        [in1] "+r"(in1),
-        [out] "+r"(out)
-        :
-        : "x21", "v0", "v1", "v2", "memory");
+    "LDR x21, [%[in1]]\n"
+    "LDR q2, [%[in1], #0x08]\n"
+    "INS v1.d[1], x21\n"
+    "ADD %x[in1], %x[in1], #0x18\n"
+    "STP q0, q1, [%[out]]\n"
+    "STR q2, [%x[out], #0x20]\n"
+    ASM_PREFETCH("[%[in1], #192]")
+    : [in0] "+r" (in0),
+      [in1] "+r" (in1),
+      [out] "+r" (out)
+    :
+    : "x21", "v0", "v1", "v2", "memory"
+  );
 }
 
 template <>
-inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out)
-{
-    __asm __volatile(
-        "LDR q0, [%x[in0]], #0x10\n"
-        "STR q0, [%x[out]]\n"
-        "LDR d1, [%x[in0]], #0x08\n" ASM_PREFETCH("[%[in0], #192]")
-        "STR d1, [%x[out], #0x10]\n"
+inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) {
+  __asm __volatile (
+    "LDR q0, [%x[in0]], #0x10\n"
+    "STR q0, [%x[out]]\n"
+    "LDR d1, [%x[in0]], #0x08\n"
+    ASM_PREFETCH("[%[in0], #192]")
+    "STR d1, [%x[out], #0x10]\n"
 
-        "LDR q0, [%x[in1]], #0x10\n"
-        "STR q0, [%x[out], #0x18]\n"
-        "LDR d1, [%x[in1]], #0x08\n" ASM_PREFETCH("[%[in1], #192]")
-        "STR d1, [%x[out], #0x28]\n"
+    "LDR q0, [%x[in1]], #0x10\n"
+    "STR q0, [%x[out], #0x18]\n"
+    "LDR d1, [%x[in1]], #0x08\n"
+    ASM_PREFETCH("[%[in1], #192]")
+    "STR d1, [%x[out], #0x28]\n"
 
-        "LDR q0, [%x[in2]], #0x10\n"
-        "STR q0, [%x[out], #0x30]\n"
-        "LDR d1, [%x[in2]], #0x08\n" ASM_PREFETCH("[%[in2], #192]")
-        "STR d1, [%x[out], #0x40]\n"
+    "LDR q0, [%x[in2]], #0x10\n"
+    "STR q0, [%x[out], #0x30]\n"
+    "LDR d1, [%x[in2]], #0x08\n"
+    ASM_PREFETCH("[%[in2], #192]")
+    "STR d1, [%x[out], #0x40]\n"
 
-        "LDR q0, [%x[in3]], #0x10\n"
-        "STR q0, [%x[out], #0x48]\n"
-        "LDR d1, [%x[in3]], #0x08\n" ASM_PREFETCH("[%[in3], #192]") "STR d1, [%x[out], #0x58]\n"
-        : [in0] "+r"(in0),
-        [in1] "+r"(in1),
-        [in2] "+r"(in2),
-        [in3] "+r"(in3),
-        [out] "+r"(out)
-        :
-        : "v0", "v1", "memory");
+    "LDR q0, [%x[in3]], #0x10\n"
+    "STR q0, [%x[out], #0x48]\n"
+    "LDR d1, [%x[in3]], #0x08\n"
+    ASM_PREFETCH("[%[in3], #192]")
+    "STR d1, [%x[out], #0x58]\n"
+    : [in0] "+r" (in0),
+      [in1] "+r" (in1),
+      [in2] "+r" (in2),
+      [in3] "+r" (in3),
+      [out] "+r" (out)
+    :
+    : "v0", "v1", "memory"
+  );
 }
 
 template <>
 template <>
 inline void TransformImpl<12, 1, true, 2, 2>::Transform(
-    uint16_t *out, const uint16_t *const in, const int stride,
-    const int x0, const int xmax, const int k0, const int kmax)
-{
-    TransposeInterleaveCommon<12, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
+    uint16_t* out, const uint16_t* const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax
+) {
+  TransposeInterleaveCommon<12, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
 }
 
 #endif // __aarch64__
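
The delegation at the top of this file works because a row of N 32-bit values occupies exactly the same bytes as 2*N 16-bit values, so the 6-wide 32-bit transpose is the same byte movement as a 12-wide 16-bit one with the stride and x-range doubled. A small sketch of that equivalence (illustrative only):

    #include <cstdint>
    #include <cstring>

    void equivalence_example() {
        uint32_t row32[6] = {0x11112222u, 0x33334444u, 0, 0, 0, 0};
        uint16_t row16[12];
        // Byte-for-byte reinterpretation: row16[0..1] hold the two halves of
        // row32[0] (low half first on little-endian AArch64), and so on.
        std::memcpy(row16, row32, sizeof(row32));
    }
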
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
index b79f32f..2f90c18 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
@@ -28,86 +28,93 @@
 #include "transpose_interleave_common.hpp"
 
 template <>
-inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x1(const __fp16 *&in0, float *out)
-{
-    __asm __volatile(
+inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x1(const __fp16 *&in0, float *out) {
+    __asm __volatile (
         "LDR    q0, [%[in0]], #16\n"
-        "FCVTL2    v1.4s, v0.8h\n"
-        "FCVTL    v0.4s, v0.4h\n"
-        "STP    q0, q1, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
-        "LDR    d2, [%[in0]], #8\n"
-        "FCVTL    v2.4s, v2.4h\n"
-        "STR    q2, [%[out], #32]\n"
-        : [in0] "+r"(in0), [out] "+r"(out)
-        :
-        : "v0", "v1", "v2", "memory");
-}
-
-template <>
-inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x2(const __fp16 *&in0, const __fp16 *&in1, float *out)
-{
-    __asm __volatile(
-        "LDR    q0, [%[in0]], #16\n"
-        "FCVTL2    v1.4s, v0.8h\n"
-        "FCVTL    v0.4s, v0.4h\n"
-        "STP    q0, q1, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
-        "LDR    d2, [%[in0]], #8\n"
-        "FCVTL    v2.4s, v2.4h\n"
-        "LDR    q3, [%[in1]], #16\n"
-        "FCVTL2    v4.4s, v3.8h\n"
-        "FCVTL    v3.4s, v3.4h\n"
-        "STP    q2, q3, [%[out], #32]\n" ASM_PREFETCH("[%[in1], #192]")
-        "LDR    d5, [%[in1]], #16\n"
-        "FCVTL    v5.4s, v5.4h\n"
-        "STP    q4, q5, [%[out], #64]\n"
-        : [in0] "+r"(in0), [in1] "+r"(in1), [out] "+r"(out)
-        :
-        : "v0", "v1", "v2", "v3", "v4", "v5", "memory");
-}
-
-template <>
-inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x4(const __fp16 *&in0, const __fp16 *&in1, const __fp16 *&in2, const __fp16 *&in3, float *out)
-{
-    __asm __volatile(
-        "LDR    q0, [%[in0]], #16\n"
-        "FCVTL2    v1.4s, v0.8h\n"
-        "FCVTL    v0.4s, v0.4h\n"
+        "FCVTL2	v1.4s, v0.8h\n"
+        "FCVTL	v0.4s, v0.4h\n"
         "STP    q0, q1, [%[out]]\n"
-        "LDR    d2, [%[in0]], #8\n" ASM_PREFETCH("[%[in0], #192]")
-        "FCVTL    v2.4s, v2.4h\n"
-        "LDR    q3, [%[in1]], #16\n"
-        "FCVTL2    v4.4s, v3.8h\n"
-        "FCVTL    v3.4s, v3.4h\n"
-        "STP    q2, q3, [%[out], #32]\n"
-        "LDR    d5, [%[in1]], #8\n"
-        "FCVTL    v5.4s, v5.4h\n" ASM_PREFETCH("[%[in1], #192]")
-        "STP    q4, q5, [%[out], #64]\n"
-        "LDR    q6, [%[in2]], #16\n"
-        "FCVTL2    v7.4s, v6.8h\n"
-        "FCVTL    v6.4s, v6.4h\n"
-        "STP    q6, q7, [%[out], #96]\n"
-        "LDR    d8, [%[in2]], #8\n"
-        "FCVTL    v8.4s, v8.4h\n" ASM_PREFETCH("[%[in2], #192]")
-        "LDR    q9, [%[in3]], #16\n"
-        "FCVTL2    v10.4s, v9.8h\n"
-        "FCVTL    v9.4s, v9.4h\n"
-        "STP    q8, q9, [%[out], #128]\n"
-        "LDR    d11, [%[in3]], #8\n"
-        "FCVTL    v11.4s, v11.4h\n"
-        "STP    q10, q11, [%[out], #160]\n" ASM_PREFETCH("[%[in3], #192]")
+        ASM_PREFETCH("[%[in0], #192]")
+        "LDR    d2, [%[in0]], #8\n"
+        "FCVTL	v2.4s, v2.4h\n"
+        "STR    q2, [%[out], #32]\n"
+    : [in0] "+r" (in0), [out] "+r" (out)
+    :
+    : "v0", "v1", "v2", "memory"
+    );
+}
 
-        : [in0] "+r"(in0), [in1] "+r"(in1), [in2] "+r"(in2), [in3] "+r"(in3), [out] "+r"(out)
-        :
-        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory");
+template <>
+inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x2(const __fp16 *&in0, const __fp16 *&in1, float *out) {
+    __asm __volatile (
+        "LDR    q0, [%[in0]], #16\n"
+        "FCVTL2	v1.4s, v0.8h\n"
+        "FCVTL	v0.4s, v0.4h\n"
+        "STP    q0, q1, [%[out]]\n"
+        ASM_PREFETCH("[%[in0], #192]")
+        "LDR    d2, [%[in0]], #8\n"
+        "FCVTL	v2.4s, v2.4h\n"
+        "LDR	q3, [%[in1]], #16\n"
+        "FCVTL2	v4.4s, v3.8h\n"
+        "FCVTL	v3.4s, v3.4h\n"
+        "STP    q2, q3, [%[out], #32]\n"
+        ASM_PREFETCH("[%[in1], #192]")
+        "LDR	d5, [%[in1]], #16\n"
+        "FCVTL	v5.4s, v5.4h\n"
+        "STP    q4, q5, [%[out], #64]\n"
+    : [in0] "+r" (in0), [in1] "+r" (in1), [out] "+r" (out)
+    :
+    : "v0", "v1", "v2", "v3", "v4", "v5", "memory"
+    );
+}
+
+template <>
+inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x4(const __fp16 *&in0, const __fp16 *&in1, const __fp16 *&in2, const __fp16 *&in3, float *out) {
+    __asm __volatile (
+        "LDR    q0, [%[in0]], #16\n"
+        "FCVTL2	v1.4s, v0.8h\n"
+        "FCVTL	v0.4s, v0.4h\n"
+        "STP    q0, q1, [%[out]]\n"
+        "LDR    d2, [%[in0]], #8\n"
+        ASM_PREFETCH("[%[in0], #192]")
+        "FCVTL	v2.4s, v2.4h\n"
+        "LDR	q3, [%[in1]], #16\n"
+        "FCVTL2	v4.4s, v3.8h\n"
+        "FCVTL	v3.4s, v3.4h\n"
+        "STP    q2, q3, [%[out], #32]\n"
+        "LDR	d5, [%[in1]], #8\n"
+        "FCVTL	v5.4s, v5.4h\n"
+        ASM_PREFETCH("[%[in1], #192]")
+        "STP    q4, q5, [%[out], #64]\n"
+        "LDR	q6, [%[in2]], #16\n"
+        "FCVTL2	v7.4s, v6.8h\n"
+        "FCVTL	v6.4s, v6.4h\n"
+        "STP    q6, q7, [%[out], #96]\n"
+        "LDR	d8, [%[in2]], #8\n"
+        "FCVTL	v8.4s, v8.4h\n"
+        ASM_PREFETCH("[%[in2], #192]")
+        "LDR	q9, [%[in3]], #16\n"
+        "FCVTL2	v10.4s, v9.8h\n"
+        "FCVTL	v9.4s, v9.4h\n"
+        "STP    q8, q9, [%[out], #128]\n"
+        "LDR	d11, [%[in3]], #8\n"
+        "FCVTL	v11.4s, v11.4h\n"
+        "STP    q10, q11, [%[out], #160]\n"
+        ASM_PREFETCH("[%[in3], #192]")
+
+    : [in0] "+r" (in0), [in1] "+r" (in1), [in2] "+r" (in2), [in3] "+r" (in3), [out] "+r" (out)
+    :
+    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory"
+    );
 }
 
 template <>
 template <>
 inline void TransformImpl<12, 1, true, 4, 2>::Transform(
-    float *out, const __fp16 *const in, const int stride,
-    const int x0, const int xmax, const int k0, const int kmax)
-{
-    TransposeInterleaveCommon<12, __fp16, float>::Transform(out, in, stride, x0, xmax, k0, kmax);
+    float* out, const __fp16* const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax
+) {
+  TransposeInterleaveCommon<12, __fp16, float>::Transform(out, in, stride, x0, xmax, k0, kmax);
 }
 
 #endif // __aarch64__ && __ARM_FP16_ARGS
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
index 5434599..b6565ba 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
@@ -31,91 +31,100 @@
 template <>
 template <typename T>
 inline void TransformImpl<12, 1, true, 4, 4>::Transform(
-    T *out, const T *const in, const int stride,
-    const int x0, const int xmax, const int k0, const int kmax)
-{
-    // Redirect to a 24 x uint16_t specialisation
-    TransformImpl<24, 1, true, 2, 2>::Transform(
-        reinterpret_cast<uint16_t *>(out),
-        reinterpret_cast<const uint16_t *const>(in),
-        stride * 2, x0 * 2, xmax * 2, k0, kmax);
+    T* out, const T* const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax
+) {
+  // Redirect to a 24 x uint16_t specialisation
+  TransformImpl<24, 1, true, 2, 2>::Transform(
+    reinterpret_cast<uint16_t *>(out),
+    reinterpret_cast<const uint16_t * const>(in),
+    stride*2, x0*2, xmax*2, k0, kmax
+  );
 }
 
 // Generic 24x16-bit sized specialisation
 template <>
 template <typename T>
 inline void TransformImpl<24, 1, true, 2, 2>::Transform(
-    T *out, const T *const in, const int stride,
-    const int x0, const int xmax, const int k0, const int kmax)
-{
-    // Redirect to a uint16_t specialisation
-    Transform(
-        reinterpret_cast<uint16_t *>(out),
-        reinterpret_cast<const uint16_t *const>(in),
-        stride, x0, xmax, k0, kmax);
+    T* out, const T* const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax
+) {
+  // Redirect to a uint16_t specialisation
+  Transform(
+    reinterpret_cast<uint16_t *>(out),
+    reinterpret_cast<const uint16_t * const>(in),
+    stride, x0, xmax, k0, kmax
+  );
 }
 
 // Specialised 24 x uint16_t version
 template <>
-inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out)
-{
-    __asm __volatile(
-        "LDP    q0, q1, [%[in0]], #32\n"
-        "STP    q0, q1, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
-        "LDR    q2, [%[in0]], #16\n"
-        "STR    q2, [%[out], #32]\n"
-        : [in0] "+r"(in0), [out] "+r"(out)
-        :
-        : "v0", "v1", "v2", "memory");
-}
-
-template <>
-inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out)
-{
-    __asm __volatile(
-        "LDP    q0, q1, [%[in0]], #32\n"
-        "STP    q0, q1, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
-        "LDR    q2, [%[in0]], #16\n"
-        "LDP    q3, q4, [%[in1]], #32\n"
-        "STP    q2, q3, [%[out], #32]\n" ASM_PREFETCH("[%[in1], #192]")
-        "LDR    q5, [%[in1]], #16\n"
-        "STP    q4, q5, [%[out], #64]\n"
-        : [in0] "+r"(in0), [in1] "+r"(in1), [out] "+r"(out)
-        :
-        : "v0", "v1", "v2", "v3", "v4", "v5", "memory");
-}
-
-template <>
-inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out)
-{
-    __asm __volatile(
+inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) {
+    __asm __volatile (
         "LDP    q0, q1, [%[in0]], #32\n"
         "STP    q0, q1, [%[out]]\n"
-        "LDR    q2, [%[in0]], #16\n" ASM_PREFETCH("[%[in0], #192]")
-        "LDP    q3, q4, [%[in1]], #32\n"
-        "STP    q2, q3, [%[out], #32]\n"
-        "LDR    q5, [%[in1]], #16\n" ASM_PREFETCH("[%[in1], #192]")
-        "STP    q4, q5, [%[out], #64]\n"
-        "LDP    q6, q7, [%[in2]], #32\n"
-        "STP    q6, q7, [%[out], #96]\n"
-        "LDR    q8, [%[in2]], #16\n" ASM_PREFETCH("[%[in2], #192]")
-        "LDP    q9, q10, [%[in3]], #32\n"
-        "STP    q8, q9, [%[out], #128]\n"
-        "LDR    q11, [%[in3]], #16\n"
-        "STP    q10, q11, [%[out], #160]\n" ASM_PREFETCH("[%[in3], #192]")
+        ASM_PREFETCH("[%[in0], #192]")
+        "LDR    q2, [%[in0]], #16\n"
+        "STR    q2, [%[out], #32]\n"
+    : [in0] "+r" (in0), [out] "+r" (out)
+    :
+    : "v0", "v1", "v2", "memory"
+    );
+}
 
-        : [in0] "+r"(in0), [in1] "+r"(in1), [in2] "+r"(in2), [in3] "+r"(in3), [out] "+r"(out)
-        :
-        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory");
+template <>
+inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1,uint16_t *out) {
+    __asm __volatile (
+        "LDP    q0, q1, [%[in0]], #32\n"
+        "STP    q0, q1, [%[out]]\n"
+        ASM_PREFETCH("[%[in0], #192]")
+        "LDR    q2, [%[in0]], #16\n"
+        "LDP	q3, q4, [%[in1]], #32\n"
+        "STP    q2, q3, [%[out], #32]\n"
+        ASM_PREFETCH("[%[in1], #192]")
+        "LDR	q5, [%[in1]], #16\n"
+        "STP    q4, q5, [%[out], #64]\n"
+    : [in0] "+r" (in0), [in1] "+r" (in1), [out] "+r" (out)
+    :
+    : "v0", "v1", "v2", "v3", "v4", "v5", "memory"
+    );
+}
+
+template <>
+inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) {
+    __asm __volatile (
+        "LDP    q0, q1, [%[in0]], #32\n"
+        "STP    q0, q1, [%[out]]\n"
+        "LDR    q2, [%[in0]], #16\n"
+        ASM_PREFETCH("[%[in0], #192]")
+        "LDP	q3, q4, [%[in1]], #32\n"
+        "STP    q2, q3, [%[out], #32]\n"
+        "LDR	q5, [%[in1]], #16\n"
+        ASM_PREFETCH("[%[in1], #192]")
+        "STP    q4, q5, [%[out], #64]\n"
+        "LDP	q6, q7, [%[in2]], #32\n"
+        "STP    q6, q7, [%[out], #96]\n"
+        "LDR	q8, [%[in2]], #16\n"
+        ASM_PREFETCH("[%[in2], #192]")
+        "LDP	q9, q10, [%[in3]], #32\n"
+        "STP    q8, q9, [%[out], #128]\n"
+        "LDR	q11, [%[in3]], #16\n"
+        "STP    q10, q11, [%[out], #160]\n"
+        ASM_PREFETCH("[%[in3], #192]")
+
+    : [in0] "+r" (in0), [in1] "+r" (in1), [in2] "+r" (in2), [in3] "+r" (in3), [out] "+r" (out)
+    :
+    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory"
+    );
 }
 
 template <>
 template <>
 inline void TransformImpl<24, 1, true, 2, 2>::Transform(
-    uint16_t *out, const uint16_t *const in, const int stride,
-    const int x0, const int xmax, const int k0, const int kmax)
-{
-    TransposeInterleaveCommon<24, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
+    uint16_t* out, const uint16_t* const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax
+) {
+  TransposeInterleaveCommon<24, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
 }
 
-#endif // __arch64__
+#endif  // __arch64__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp b/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp
index 3218ca1..63e85c1 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp
@@ -24,137 +24,117 @@
 #pragma once
 
 template <unsigned int IntBy, typename TIn, typename TOut>
-struct TransposeInterleaveCommon
-{
-    // Override the moveblock_1xY methods to improve performance
-    static inline void moveblock_1x1(const TIn *&in0, TOut *out)
-    {
-        for(unsigned int i = 0; i < IntBy; i++)
-        {
-            *out++ = static_cast<TOut>(*in0++);
+struct TransposeInterleaveCommon {
+  // Override the moveblock_1xY methods to improve performance
+  static inline void moveblock_1x1(const TIn *&in0, TOut *out) {
+    for (unsigned int i = 0; i < IntBy; i++) {
+      *out++ = static_cast<TOut>(*in0++);
+    }
+  }
+
+  static inline void moveblock_1x2(const TIn *&in0, const TIn *&in1, TOut *out) {
+    for (unsigned int i = 0; i < IntBy; i++) {
+      *out++ = static_cast<TOut>(*in0++);
+    }
+    for (unsigned int i = 0; i < IntBy; i++) {
+      *out++ = static_cast<TOut>(*in1++);
+    }
+  }
+
+  static inline void moveblock_1x4(const TIn *&in0, const TIn *&in1, const TIn *&in2, const TIn *&in3, TOut *out) {
+    for (unsigned int i = 0; i < IntBy; i++) {
+      *out++ = static_cast<TOut>(*in0++);
+    }
+    for (unsigned int i = 0; i < IntBy; i++) {
+      *out++ = static_cast<TOut>(*in1++);
+    }
+    for (unsigned int i = 0; i < IntBy; i++) {
+      *out++ = static_cast<TOut>(*in2++);
+    }
+    for (unsigned int i = 0; i < IntBy; i++) {
+      *out++ = static_cast<TOut>(*in3++);
+    }
+  }
+
+  static inline void Transform(TOut *out, const TIn *in, const int stride, const int x0, const int xmax, const int k0, const int kmax) {
+    const auto ldin = stride;
+
+    TOut *outarray = out;
+    const TIn *inarray = in;
+    TOut *outptr_base = outarray;
+    const TIn *inptr_base = inarray + x0 + (k0 * ldin);
+    int ldout = (kmax - k0) * IntBy;
+
+    int k=(kmax-k0);
+    for ( ; k>3; k-=4) {
+        TOut *outptr = outptr_base;
+        const TIn *inptr = inptr_base;
+        const TIn *inptr1 = inptr + ldin;
+        const TIn *inptr2 = inptr1 + ldin;
+        const TIn *inptr3 = inptr2 + ldin;
+
+        prefetch_3x(inptr);
+        prefetch_3x(inptr1);
+        prefetch_3x(inptr2);
+        prefetch_3x(inptr3);
+
+        outptr_base += IntBy * 4;
+        inptr_base += ldin * 4;
+
+        for (int x = (xmax-x0) / IntBy; x > 0 ; x--) {
+            moveblock_1x4(inptr, inptr1, inptr2, inptr3, outptr);
+            outptr += ldout;
         }
     }
 
-    static inline void moveblock_1x2(const TIn *&in0, const TIn *&in1, TOut *out)
-    {
-        for(unsigned int i = 0; i < IntBy; i++)
-        {
-            *out++ = static_cast<TOut>(*in0++);
-        }
-        for(unsigned int i = 0; i < IntBy; i++)
-        {
-            *out++ = static_cast<TOut>(*in1++);
-        }
-    }
+    if (k) {
+        TOut *outptr = outptr_base;
+        const TIn *inptr = inptr_base;
+        const TIn *inptr1 = inptr + ldin;
+        const TIn *inptr2 = inptr1 + ldin;
 
-    static inline void moveblock_1x4(const TIn *&in0, const TIn *&in1, const TIn *&in2, const TIn *&in3, TOut *out)
-    {
-        for(unsigned int i = 0; i < IntBy; i++)
-        {
-            *out++ = static_cast<TOut>(*in0++);
-        }
-        for(unsigned int i = 0; i < IntBy; i++)
-        {
-            *out++ = static_cast<TOut>(*in1++);
-        }
-        for(unsigned int i = 0; i < IntBy; i++)
-        {
-            *out++ = static_cast<TOut>(*in2++);
-        }
-        for(unsigned int i = 0; i < IntBy; i++)
-        {
-            *out++ = static_cast<TOut>(*in3++);
-        }
-    }
+        prefetch_3x(inptr);
+        prefetch_3x(inptr1);
+        prefetch_3x(inptr2);
 
-    static inline void Transform(TOut *out, const TIn *in, const int stride, const int x0, const int xmax, const int k0, const int kmax)
-    {
-        const auto ldin = stride;
+        for (int x = (xmax-x0) / IntBy; x > 0 ; x--) {
+            switch(k) {
+                case 3:
+                    moveblock_1x2(inptr, inptr1, outptr);
+                    moveblock_1x1(inptr2, outptr + IntBy * 2);
+                    break;
 
-        TOut      *outarray    = out;
-        const TIn *inarray     = in;
-        TOut      *outptr_base = outarray;
-        const TIn *inptr_base  = inarray + x0 + (k0 * ldin);
-        int        ldout       = (kmax - k0) * IntBy;
+                case 2:
+                    moveblock_1x2(inptr, inptr1, outptr);
+                    break;
 
-        int k = (kmax - k0);
-        for(; k > 3; k -= 4)
-        {
-            TOut      *outptr = outptr_base;
-            const TIn *inptr  = inptr_base;
-            const TIn *inptr1 = inptr + ldin;
-            const TIn *inptr2 = inptr1 + ldin;
-            const TIn *inptr3 = inptr2 + ldin;
+                case 1:
+                    moveblock_1x1(inptr, outptr);
+                    break;
 
-            prefetch_3x(inptr);
-            prefetch_3x(inptr1);
-            prefetch_3x(inptr2);
-            prefetch_3x(inptr3);
-
-            outptr_base += IntBy * 4;
-            inptr_base += ldin * 4;
-
-            for(int x = (xmax - x0) / IntBy; x > 0; x--)
-            {
-                moveblock_1x4(inptr, inptr1, inptr2, inptr3, outptr);
-                outptr += ldout;
+                default:
+                    UNREACHABLE("Impossible.");
             }
+
+            outptr  += ldout;
         }
+    }
 
-        if(k)
-        {
-            TOut      *outptr = outptr_base;
-            const TIn *inptr  = inptr_base;
-            const TIn *inptr1 = inptr + ldin;
-            const TIn *inptr2 = inptr1 + ldin;
+    // Cope with ragged X cases
+    const unsigned int overflow = (xmax - x0) % IntBy;
+    if (overflow) {
+        const TIn *inptr_base = inarray + (xmax - overflow) + (k0 * ldin);
+        TOut *outptr = outarray + ((xmax - x0) / IntBy) * ldout;
 
-            prefetch_3x(inptr);
-            prefetch_3x(inptr1);
-            prefetch_3x(inptr2);
+        for (int k=(kmax-k0); k>0; k--) {
+            const TIn *inptr = inptr_base;
+            inptr_base += ldin;
 
-            for(int x = (xmax - x0) / IntBy; x > 0; x--)
-            {
-                switch(k)
-                {
-                    case 3:
-                        moveblock_1x2(inptr, inptr1, outptr);
-                        moveblock_1x1(inptr2, outptr + IntBy * 2);
-                        break;
-
-                    case 2:
-                        moveblock_1x2(inptr, inptr1, outptr);
-                        break;
-
-                    case 1:
-                        moveblock_1x1(inptr, outptr);
-                        break;
-
-                    default:
-                        UNREACHABLE("Impossible.");
-                }
-
-                outptr += ldout;
-            }
-        }
-
-        // Cope with ragged X cases
-        const unsigned int overflow = (xmax - x0) % IntBy;
-        if(overflow)
-        {
-            const TIn *inptr_base = inarray + (xmax - overflow) + (k0 * ldin);
-            TOut      *outptr     = outarray + ((xmax - x0) / IntBy) * ldout;
-
-            for(int k = (kmax - k0); k > 0; k--)
-            {
-                const TIn *inptr = inptr_base;
-                inptr_base += ldin;
-
-                for(unsigned int x = 0; x < IntBy; x++)
-                {
-                    TOut val  = (x < overflow) ? static_cast<TOut>(*inptr++) : static_cast<TOut>(0);
-                    *outptr++ = val;
-                }
+            for (unsigned int x=0; x < IntBy; x++) {
+                TOut val = (x < overflow) ? static_cast<TOut>(*inptr++) : static_cast<TOut>(0);
+                *outptr++ = val;
             }
         }
     }
+}
 };
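
As a usage illustration for the generic Transform above (a hedged sketch, assuming the surrounding arm_gemm headers providing TransposeInterleaveCommon, prefetch_3x and UNREACHABLE are included): it walks K in groups of four rows via moveblock_1x4, mops up a 1-3 row tail with moveblock_1x2/moveblock_1x1, and zero-pads the ragged X tail.

    #include <cstdint>
    #include <vector>

    void transform_example() {
        const int K = 4, X = 24;                    // 4 rows (K) by 24 columns (X)
        std::vector<uint16_t> in(K * X), out(K * X);
        for (int i = 0; i < K * X; i++) in[i] = static_cast<uint16_t>(i);

        // stride is the row length of 'in'; [x0,xmax) and [k0,kmax) pick the window.
        TransposeInterleaveCommon<12, uint16_t, uint16_t>::Transform(
            out.data(), in.data(), /*stride=*/X, /*x0=*/0, /*xmax=*/X, /*k0=*/0, /*kmax=*/K);
    }
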
diff --git a/src/core/NEON/kernels/arm_gemm/utils.hpp b/src/core/NEON/kernels/arm_gemm/utils.hpp
index 6c5b92a..c1977d5 100644
--- a/src/core/NEON/kernels/arm_gemm/utils.hpp
+++ b/src/core/NEON/kernels/arm_gemm/utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,27 +25,22 @@
 #pragma once
 
 // Macro for unreachable code (e.g. impossible default cases on switch)
-#define UNREACHABLE(why) __builtin_unreachable()
+#define UNREACHABLE(why)  __builtin_unreachable()
 
 // Paranoid option for the above with assert
 // #define UNREACHABLE(why)   assert(0 && why)
 
-inline int iceildiv(const int a, const int b)
-{
-    return (a + b - 1) / b;
+inline int iceildiv(const int a, const int b) {
+  return (a + b - 1) / b;
 }
 
 template <typename T>
-inline T roundup(const T a, const T b)
-{
-    T rem = a % b;
+inline T roundup(const T a, const T b) {
+  T rem = a % b;
 
-    if(rem)
-    {
-        return a + b - rem;
-    }
-    else
-    {
-        return a;
-    }
+  if (rem) {
+    return a + b - rem;
+  } else {
+    return a;
+  }
 }