COMPMID-881: RSH new arm_gemm interface.

Change-Id: I1e2a1a77097d8017c274af3f97eba6964f80f5fa
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/122592
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/src/core/NEON/kernels/arm_gemm/asmlib.hpp b/src/core/NEON/kernels/arm_gemm/asmlib.hpp
new file mode 100644
index 0000000..b3fcb33
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/asmlib.hpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+// Macros to use in assembler to get a preload.  Needed because of the
+// various workarounds required to get working preload behaviour.
+//
+// Code using these macros needs to clobber x20 and x21 as they might be
+// used by the workaround.
+
+// "Correct" version
+#define ASM_PREFETCH(address) "PRFM PLDL1KEEP, " address "\n"
+#define ASM_PREFETCHL2(address) "PRFM PLDL2KEEP, " address "\n"
+#define ASM_PREFETCHW(address) "PRFM PSTL1KEEP, " address "\n"
+#define ASM_PREFETCHWL2(address) "PRFM PSTL2KEEP, " address "\n"
+
+// Lee's uarchsim hack
+//#define ASM_PREFETCH(address)    "LDNP x20, x21, " address "\n"
+
+// No preload at all
+//#define ASM_PREFETCH(address) ""
+#else
+
+// "Correct" versions for AArch32
+#define ASM_PREFETCH(address) "PLD " address "\n"
+#define ASM_PREFETCHW(address) "PLDW " address "\n"
+
+#endif
+
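+// For example, inside an inline asm block ASM_PREFETCH("[%[ptr], #64]")
+// expands to the string literal "PRFM PLDL1KEEP, [%[ptr], #64]\n" on AArch64
+// (or "PLD [%[ptr], #64]\n" on AArch32), so successive invocations simply
+// concatenate into a single multi-instruction assembler template, as the
+// prefetch_Nx() helpers below do.
+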
+/*
+ * Do some prefetches.
+ */
+template <typename T>
+static inline void prefetch_6x(const T *pfp)
+{
+    __asm __volatile(
+        ASM_PREFETCH("[%[pfp]]")
+        ASM_PREFETCH("[%[pfp], #64]")
+        ASM_PREFETCH("[%[pfp], #128]")
+        ASM_PREFETCH("[%[pfp], #192]")
+        ASM_PREFETCH("[%[pfp], #256]")
+        ASM_PREFETCH("[%[pfp], #320]")
+        :
+        : [pfp] "r"(pfp)
+        : "memory");
+}
+
+template <typename T>
+static inline void prefetch_5x(const T *pfp)
+{
+    __asm __volatile(
+        ASM_PREFETCH("[%[pfp]]")
+        ASM_PREFETCH("[%[pfp], #64]")
+        ASM_PREFETCH("[%[pfp], #128]")
+        ASM_PREFETCH("[%[pfp], #192]")
+        ASM_PREFETCH("[%[pfp], #256]")
+        :
+        : [pfp] "r"(pfp)
+        : "memory");
+}
+
+template <typename T>
+static inline void prefetch_4x(const T *pfp)
+{
+    __asm __volatile(
+        ASM_PREFETCH("[%[pfp]]")
+        ASM_PREFETCH("[%[pfp], #64]")
+        ASM_PREFETCH("[%[pfp], #128]")
+        ASM_PREFETCH("[%[pfp], #192]")
+        :
+        : [pfp] "r"(pfp)
+        : "memory");
+}
+
+template <typename T>
+static inline void prefetch_3x(const T *pfp)
+{
+    __asm __volatile(
+        ASM_PREFETCH("[%[pfp]]")
+        ASM_PREFETCH("[%[pfp], #64]")
+        ASM_PREFETCH("[%[pfp], #128]")
+        :
+        : [pfp] "r"(pfp)
+        : "memory");
+}
+
+template <typename T>
+static inline void prefetch_2x(const T *pfp)
+{
+    __asm __volatile(
+        ASM_PREFETCH("[%[pfp]]")
+        ASM_PREFETCH("[%[pfp], #64]")
+        :
+        : [pfp] "r"(pfp)
+        : "memory");
+}
+
+template <typename T>
+static inline void prefetch_1x(const T *pfp)
+{
+    __asm __volatile(
+        ASM_PREFETCH("[%[pfp]]")
+        :
+        : [pfp] "r"(pfp)
+        : "memory");
+}
diff --git a/src/core/NEON/kernels/arm_gemm/buffer_manager.hpp b/src/core/NEON/kernels/arm_gemm/buffer_manager.hpp
new file mode 100644
index 0000000..dd74744
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/buffer_manager.hpp
@@ -0,0 +1,379 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <cstdlib>
+#include <vector>
+
+#ifndef NO_MULTI_THREADING
+#include <atomic>
+#include <mutex>
+
+#define USE_SEMAPHORE
+
+#ifdef USE_SEMAPHORE
+#include <condition_variable>
+#endif
+
+#endif
+
+namespace arm_gemm
+{
+#ifndef NO_MULTI_THREADING
+enum class BufferStatus
+{
+    IDLE,
+    POPULATING,
+    BUSY
+};
+
+class Buffer
+{
+private:
+    const int   _maxusers; // Maximum permissible threads.
+    void *const _storage;  // Storage for buffer content.
+
+    int _numusers; // Actual number of threads (might be lower).
+
+    volatile BufferStatus _status = BufferStatus::IDLE; // Status
+    std::atomic_int       _users  = {};                 // How many users are still using the buffer.
+    volatile int          _index  = 0;                  // Which block of data currently resides in the buffer.
+
+    std::mutex _lock = {};
+#ifdef USE_SEMAPHORE
+    std::condition_variable _cv = {};
+#endif
+
+    template <typename T>
+    void populate_buffer(T func)
+    {
+        func(_storage);
+
+        /* Now mark it as ready. */
+#ifdef USE_SEMAPHORE
+        {
+            std::unique_lock<std::mutex> ul(_lock);
+            _status = BufferStatus::BUSY;
+            _cv.notify_all();
+        }
+#else
+        _status     = BufferStatus::BUSY;
+#endif
+    }
+
+public:
+    Buffer(Buffer &) = delete;
+    Buffer &operator=(Buffer &) = delete;
+
+    Buffer(void *storage, int maxusers)
+        : _maxusers(maxusers), _storage(storage), _numusers(maxusers)
+    {
+        _status = BufferStatus::IDLE;
+    }
+
+    /* Try and populate the given index.
+     * Wait if the buffer is busy with previous index, then:
+     *
+     * If the buffer is idle, grab it and populate it.
+     * If it's already being populated by another thread or is ready, return.
+     */
+    template <typename T>
+    void try_populate(const int index, T func)
+    {
+        for(;;)
+        {
+#ifdef USE_SEMAPHORE
+            /* If it's busy with a previous index, wait on the semaphore. */
+            if((_status == BufferStatus::BUSY) && (_index != index))
+            {
+                std::unique_lock<std::mutex> ul(_lock);
+
+                if((_status == BufferStatus::BUSY) && (_index != index))
+                {
+                    _cv.wait(ul);
+                }
+            }
+#endif
+            /* Return if another thread is populating it already. */
+            if((_index == index) && ((_status == BufferStatus::POPULATING) || (_status == BufferStatus::BUSY)))
+            {
+                return;
+            }
+
+            if(_status == BufferStatus::IDLE)
+            {
+                std::lock_guard<std::mutex> guard(_lock);
+
+                /* If the buffer is still idle, we can grab it and populate it. */
+                if(_status == BufferStatus::IDLE)
+                {
+                    _status = BufferStatus::POPULATING;
+                    _index  = index;
+                    _users  = _numusers;
+                    break;
+                }
+            }
+        }
+
+        /* If we get here, fill in the buffer. */
+        populate_buffer(func);
+    }
+
+    template <typename T>
+    void *get(const int index, T func)
+    {
+        // Loop until we achieve something.
+        for(;;)
+        {
+            // If the index is correct and the buffer status is busy then we can
+            // just return the content.  No locking is needed here as the index
+            // cannot change (and status cannot change from BUSY) until all
+            // users have finished.
+            if((_index == index) && (_status == BufferStatus::BUSY))
+            {
+                return _storage;
+            }
+#ifdef USE_SEMAPHORE
+            if(((_status == BufferStatus::BUSY) && (_index != index)) || (_status == BufferStatus::POPULATING))
+            {
+                std::unique_lock<std::mutex> ul(_lock);
+
+                if(((_status == BufferStatus::BUSY) && (_index != index)) || (_status == BufferStatus::POPULATING))
+                {
+                    _cv.wait(ul);
+                }
+            }
+#endif
+
+            // If it's idle, we need to populate it.  The IDLE->POPULATING
+            // transition requires the lock.
+            if(_status == BufferStatus::IDLE)
+            {
+                std::lock_guard<std::mutex> guard(_lock);
+
+                /* If it's still idle, grab it.  Otherwise drop through and
+                 * we'll do something else next time through the loop.  */
+                if(_status == BufferStatus::IDLE)
+                {
+                    _status = BufferStatus::POPULATING;
+                    _index  = index;
+                    _users  = _numusers;
+                    break;
+                }
+            }
+        }
+
+        /* If we get here we need to populate the buffer. */
+        populate_buffer(func);
+
+        return _storage;
+    }
+
+    /* Threads call this when they have finished processing a buffer.  We
+     * simply (atomically) decrement the user count, and if it's hit zero we
+     * flag the buffer as idle.
+     */
+    void release(void)
+    {
+        if(--_users == 0)
+        {
+#ifdef USE_SEMAPHORE
+            std::unique_lock<std::mutex> ul(_lock);
+            _status = BufferStatus::IDLE;
+            /* We notify all waiters as we expect one to do the populating
+             * and any others to go and process an earlier block.  */
+            _cv.notify_all();
+#else
+            _status = BufferStatus::IDLE;
+#endif
+        }
+    }
+
+    /* This is called to change the number of users. */
+    void set_numusers(int numusers)
+    {
+        _numusers = std::min(numusers, _maxusers);
+    }
+};
+
+class BufferManager
+{
+private:
+    /* This has to be a vector of Buffer *, because a Buffer cannot be moved
+     * or copied due to atomic members. */
+    std::vector<Buffer *> _buffers = {};
+    const int             _maxthreads;
+    void *const           _storage;
+
+public:
+    BufferManager(BufferManager &) = delete;
+    BufferManager &operator=(BufferManager &) = delete;
+
+    // Say how much storage is needed.
+    static inline size_t get_storage_requirement(const int maxthreads, const size_t buffersize)
+    {
+        return buffersize * ((maxthreads == 1) ? 1 : 3);
+    }
+
+    BufferManager(const int maxthreads, const size_t buffersize, void *storage)
+        : _maxthreads(maxthreads), _storage(storage)
+    {
+        const int numbuffers = (maxthreads == 1) ? 1 : 3;
+
+        /* We don't need any Buffer objects in single thread mode. */
+        if(_maxthreads == 1)
+        {
+            return;
+        }
+
+        /* Use intptr_t to avoid performing arithmetic on a void * */
+        intptr_t storage_int = reinterpret_cast<intptr_t>(_storage);
+
+        for(int i = 0; i < numbuffers; i++)
+        {
+            _buffers.push_back(new Buffer(reinterpret_cast<void *>(storage_int), _maxthreads));
+            storage_int += buffersize;
+        }
+    }
+
+    ~BufferManager()
+    {
+        while(_buffers.size())
+        {
+            delete _buffers.back();
+            _buffers.pop_back();
+        }
+    }
+
+    template <typename T>
+    void *get(const int index, T func)
+    {
+        /* In single thread mode, we just directly call the populating
+         * function on the (single) buffer, otherwise forward to the
+         * relevant Buffer.  */
+        if(_maxthreads == 1)
+        {
+            func(_storage);
+            return _storage;
+        }
+        else
+        {
+            return _buffers[index % _buffers.size()]->get(index, func);
+        }
+    }
+
+    template <typename T>
+    void try_populate(const int index, T func)
+    {
+        /* No need for this in single thread mode. */
+        if(_maxthreads == 1)
+        {
+            return;
+        }
+
+        _buffers[index % _buffers.size()]->try_populate(index, func);
+    }
+
+    void release(const int index)
+    {
+        /* No need for this in single thread mode. */
+        if(_maxthreads == 1)
+        {
+            return;
+        }
+
+        _buffers[index % _buffers.size()]->release();
+    }
+
+    void set_nthreads(int threads)
+    {
+        if(_maxthreads == 1)
+        {
+            return;
+        }
+
+        for(unsigned int i = 0; i < _buffers.size(); i++)
+        {
+            _buffers[i]->set_numusers(threads);
+        }
+    }
+};
+
+#else
+
+/* Trivial implementation if threading is disabled at compile time.
+ *
+ * Here, we only need storage for a single buffer.  The 'get' method needs
+ * to call the supplied function to populate the buffer and then return it.
+ * All the other methods do nothing.
+ */
+
+class BufferManager
+{
+private:
+    void *const _storage;
+
+public:
+    BufferManager(BufferManager &) = delete;
+    BufferManager &operator=(BufferManager &) = delete;
+
+    BufferManager(const int maxthreads, const size_t buffersize, void *storage)
+        : _storage(storage)
+    {
+    }
+
+    ~BufferManager()
+    {
+    }
+
+    // Say how much storage is needed.
+    static inline size_t get_storage_requirement(const int maxthreads, const size_t buffersize)
+    {
+        return buffersize;
+    }
+
+    template <typename T>
+    void try_populate(const int index, T func)
+    {
+    }
+
+    void release(const int index)
+    {
+    }
+
+    template <typename T>
+    void *get(const int index, T func)
+    {
+        func(_storage);
+        return _storage;
+    }
+
+    void set_nthreads(int)
+    {
+    }
+};
+
+#endif
+
+} // namespace arm_gemm
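+
+// Usage sketch (illustrative only; the names nthreads, b_panel_size and
+// fill_b_panel are hypothetical, not part of this patch).  Each worker asks
+// for the block it needs via get(), which either populates the buffer itself
+// or waits for another thread to finish doing so, and calls release() when
+// it is done so the buffer can be recycled for a later block:
+//
+//   size_t bytes = arm_gemm::BufferManager::get_storage_requirement(nthreads, b_panel_size);
+//   arm_gemm::BufferManager bm(nthreads, b_panel_size, malloc(bytes));
+//
+//   // In each thread, for each block index in ascending order:
+//   void *b_panel = bm.get(index, [&](void *buf) { fill_b_panel(buf, index); });
+//   //   ... use b_panel for this thread's share of the work ...
+//   bm.release(index);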
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
new file mode 100644
index 0000000..b9729d4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
+
+#include "arm_gemm.hpp"
+
+#include "gemm_common.hpp"
+#include "gemm_interleaved.hpp"
+
+#include "kernels/a32_sgemm_8x6.hpp"
+#include "kernels/a64_hgemm_24x8.hpp"
+#include "kernels/a64_sgemm_12x8.hpp"
+
+namespace arm_gemm
+{
+template <>
+UniqueGemmCommon<__fp16, __fp16> gemm(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
+                                      const bool trA, const bool trB, const __fp16 alpha, const __fp16 beta,
+                                      const int maxthreads, const bool pretransposed_hint)
+{
+#ifdef __aarch64__
+    /* If FP16 is supported, use it */
+    if(ci.has_fp16())
+    {
+        return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved<hgemm_24x8, __fp16, __fp16>(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+    }
+
+    /* Fallback to using the blocked SGEMM kernel. */
+    return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved<sgemm_12x8, __fp16, __fp16>(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+#else
+    /* For AArch32, only support the SGEMM route. */
+    return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved<sgemm_8x6, __fp16, __fp16>(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+#endif
+}
+
+// Instantiate static class members
+#ifdef __aarch64__
+const int hgemm_24x8::out_width;
+const int hgemm_24x8::out_height;
+#endif
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
new file mode 100644
index 0000000..1baa21f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_gemm.hpp"
+#include "gemm_common.hpp"
+#include "gemm_interleaved.hpp"
+#include "gemm_native.hpp"
+#include "gemv_native_transposed.hpp"
+#include "gemv_pretransposed.hpp"
+
+#include "kernels/a32_sgemm_8x6.hpp"
+#include "kernels/a64_sgemm_12x8.hpp"
+#include "kernels/a64_sgemm_native_16x4.hpp"
+#include "kernels/a64_sgemv_pretransposed.hpp"
+#include "kernels/a64_sgemv_trans.hpp"
+
+namespace arm_gemm
+{
+template <>
+UniqueGemmCommon<float, float> gemm<float, float>(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
+                                                  const bool trA, const bool trB, const float alpha, const float beta,
+                                                  const int maxthreads, const bool pretransposed_hint)
+{
+#ifdef __aarch64__
+    /* Cases in priority order */
+    /* GemvPretransposed: requires M=1, alpha=1, and the pretransposed hint set */
+    if(M == 1 && alpha == 1.0f && pretransposed_hint)
+    {
+        return UniqueGemmCommon<float, float>(new GemvPretransposed<sgemv_pretransposed, float, float>(&ci, N, K, trB, beta));
+    }
+
+    /* GemvNativeTransposed: requires M=1, no trA or trB, doesn't handle beta */
+    if(M == 1 && beta == 1.0f && !trA && !trB)
+    {
+        return UniqueGemmCommon<float, float>(new GemvNativeTransposed<sgemv_trans, float, float>(&ci, N, K, alpha));
+    }
+
+    /* Native GEMM: requires M to be a multiple of 4, K a multiple of 4, N a
+     * multiple of 16, doesn't handle alpha and only makes sense for small
+     * sizes.  */
+    if(N <= 128 && K <= 128 && ((M % 4) == 0) && ((K % 4) == 0) && ((N % 16) == 0) && alpha == 1.0f)
+    {
+        return UniqueGemmCommon<float, float>(new GemmNative<sgemm_native_16x4, float, float>(&ci, M, N, K, beta));
+    }
+
+    /* Blocked GEMM, handles all cases. */
+    return UniqueGemmCommon<float, float>(new GemmInterleaved<sgemm_12x8, float, float>(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+#else
+    return UniqueGemmCommon<float, float>(new GemmInterleaved<sgemm_8x6, float, float>(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+#endif
+}
+
+// Instantiate static class variables.
+#ifdef __aarch64__
+const int sgemm_12x8::out_width;
+const int sgemm_12x8::out_height;
+
+const int sgemm_native_16x4::out_width;
+const int sgemm_native_16x4::out_height;
+#else
+const int sgemm_8x6::out_width;
+const int sgemm_8x6::out_height;
+#endif
+
+} // namespace arm_gemm
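+
+// Usage sketch for the factory above (illustrative only; the CPUInfo value,
+// the aligned buffer and the way A/B/C are attached are assumptions about
+// the surrounding GemmCommon interface, which is defined elsewhere, and
+// UniqueGemmCommon is assumed to behave like a smart pointer):
+//
+//   auto gemm = arm_gemm::gemm<float, float>(ci, M, N, K, false, false,
+//                                            1.0f, 0.0f, nthreads, false);
+//   void *ws = aligned_alloc(64, gemm->get_working_size());
+//   gemm->set_working_space(ws);
+//   // Attach the A/B/C pointers via the GemmCommon setters (not part of
+//   // this patch), then split [0, get_window_size()) across threads and
+//   // have each call gemm->execute(start, end, threadid).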
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
new file mode 100644
index 0000000..344bfed
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "gemm_common.hpp"
+#include "gemm_interleaved.hpp"
+
+#include "kernels/a64_gemm_s16_12x8.hpp"
+
+namespace arm_gemm
+{
+template <>
+UniqueGemmCommon<int16_t, int32_t> gemm<int16_t, int32_t>(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
+                                                          const bool trA, const bool trB, const int32_t alpha, const int32_t beta,
+                                                          const int maxthreads, const bool pretransposed_hint)
+{
+    return UniqueGemmCommon<int16_t, int32_t>(new GemmInterleaved<gemm_s16_12x8, int16_t, int32_t>(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+}
+
+// Instantiate static class members
+const int gemm_s16_12x8::out_width;
+const int gemm_s16_12x8::out_height;
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
new file mode 100644
index 0000000..856d407
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "gemm_common.hpp"
+#include "gemm_interleaved.hpp"
+
+#include "kernels/a64_gemm_s16_12x8.hpp"
+#include "kernels/a64_gemm_s8_12x8.hpp"
+#include "kernels/a64_gemm_s8_4x4.hpp"
+
+namespace arm_gemm
+{
+template <>
+UniqueGemmCommon<int8_t, int32_t> gemm<int8_t, int32_t>(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
+                                                        const bool trA, const bool trB, const int32_t alpha, const int32_t beta,
+                                                        const int maxthreads, const bool pretransposed_hint)
+{
+    if(ci.has_dotprod())
+    {
+        // Dot product supporting CPUs.  This family has a special version for A55r1.
+        return UniqueGemmCommon<int8_t, int32_t>(new GemmInterleaved<gemm_s8_12x8, int8_t, int32_t>(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+    }
+
+    return UniqueGemmCommon<int8_t, int32_t>(new GemmInterleaved<gemm_s8_4x4, int8_t, int32_t>(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+
+    // TODO: There's a better approach for A53, but it doesn't work
+    // well on heterogeneous systems as the required data formats
+    // are different.  Figure out how to enable this:
+    // gemm = new GemmInterleaved<gemm_s16_12x8, int8_t, int32_t>(ci, M, N, K, trA, trB);
+}
+
+// Instantiate static class members
+const int gemm_s8_12x8::out_width;
+const int gemm_s8_12x8::out_height;
+const int gemm_s8_4x4::out_width;
+const int gemm_s8_4x4::out_height;
+
+} // namespace arm_gemm
+
+#endif // aarch64
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
new file mode 100644
index 0000000..27e4e8d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -0,0 +1,535 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <assert.h>
+#include <stdio.h>
+
+#include <algorithm>
+
+#include "arm_gemm.hpp"
+#include "utils.hpp"
+
+#include "buffer_manager.hpp"
+#include "mergeresults.hpp"
+#include "profiler.hpp"
+#include "transform.hpp"
+
+// Some macros used to decide how much working space to allocate.
+// Round allocations up to the next cache line.
+#define ALLOC_ROUND 64
+#define ROUND_UP(x) ((((x) + ALLOC_ROUND - 1) / ALLOC_ROUND) * ALLOC_ROUND)
+
+// Implementation of the GemmCommon abstract class.
+//
+// This implementation interleaves the source matrices in blocks - good for
+// larger matrices.
+namespace arm_gemm
+{
+template <typename strategy, typename To, typename Tr>
+class GemmInterleaved : public GemmCommon<To, Tr>
+{
+    typedef typename strategy::operand_type Toi;
+    typedef typename strategy::result_type  Tri;
+
+    /* const properties set by constructor */
+    const CPUInfo *const _ci;
+
+    const unsigned int _Msize;
+    const unsigned int _Nsize;
+    const unsigned int _Ksize;
+
+    const bool _trA;
+    const bool _trB;
+
+    const Tr _alpha;
+    const Tr _beta;
+
+    const unsigned int _maxthreads;
+    const bool         _pretransposed;
+
+    /* Blocking info */
+    unsigned int _k_block = 0;
+    unsigned int _x_block = 0;
+    unsigned int _Mround  = 0;
+
+    /* Working space, pretransposed buffer, buffer manager */
+    const Toi     *_B_transposed  = nullptr;
+    BufferManager *_bm            = nullptr;
+    void          *_working_space = nullptr;
+
+    /* We will need to walk through the blocks of B in a few contexts, so
+     * factor that out.  */
+    class blockwalker
+    {
+    private:
+        /* Loop parameters, we only block up N and K so don't worry about M. */
+        const unsigned int _Nsize, _Ksize, _x_block, _k_block;
+
+        /* K and X parameters for current iteration. */
+        unsigned int _k0 = 0, _x0 = 0;
+
+        unsigned int _index     = 0;
+        bool         _done      = false;
+        bool         _newkblock = true;
+
+    public:
+        blockwalker(const unsigned int K, const unsigned int k_block, const unsigned int N, const unsigned int x_block)
+            : _Nsize(N), _Ksize(K), _x_block(x_block), _k_block(k_block)
+        {
+        }
+
+        unsigned int xmax()
+        {
+            return std::min(_x0 + _x_block, _Nsize);
+        }
+
+        unsigned int kmax()
+        {
+            return std::min(_k0 + _k_block, _Ksize);
+        }
+
+        /* Advance to the next block, return false at the end. */
+        bool advance(void)
+        {
+            if(_done)
+            {
+                return false;
+            }
+
+            _newkblock = false;
+            _x0 += _x_block;
+            if(_x0 >= _Nsize)
+            {
+                _x0 = 0;
+                _k0 += _k_block;
+                if(_k0 >= _Ksize)
+                {
+                    _done = true;
+                    return false;
+                }
+                _newkblock = true;
+            }
+            _index++;
+
+            return true;
+        }
+
+        unsigned int k0(void)
+        {
+            return _k0;
+        }
+        unsigned int x0(void)
+        {
+            return _x0;
+        }
+        unsigned int index(void)
+        {
+            return _index;
+        }
+        bool done(void)
+        {
+            return _done;
+        }
+        bool newkblock(void)
+        {
+            return _newkblock;
+        }
+    };
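+
+    // Illustrative iteration order: with K=60, k_block=32, N=100 and
+    // x_block=48 the walker visits (k0, x0) = (0,0), (0,48), (0,96),
+    // (32,0), (32,48), (32,96) and then reports done().  newkblock() is
+    // true only for the first block of each k0 value, and kmax()/xmax()
+    // clamp to K and N respectively.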
+
+    // A working size: One of these needed, regardless of thread count.  Divided according to window.
+    size_t get_a_working_size() const
+    {
+        return ROUND_UP(sizeof(Toi) * _k_block * _Mround);
+    }
+
+    // B working size: 0, 1 or 3 of these needed depending on pretransposed and threading settings.
+    size_t get_b_working_size() const
+    {
+        return ROUND_UP(sizeof(Toi) * _x_block * _k_block);
+    }
+
+    // C working size: One needed per thread.
+    size_t get_c_working_size() const
+    {
+        return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height);
+    }
+
+    // Internal execute function.
+    // This supports both the "pretransposed" and "standard" interfaces via the template parameter.
+    template <bool pretransposed>
+    void execute_internal(unsigned int start, unsigned int end, int threadid)
+    {
+        profiler prof;
+        strategy strat(_ci);
+
+        blockwalker current(_Ksize, _k_block, _Nsize, _x_block);
+        blockwalker next = current;
+
+        /* Compute the M values to operate on */
+        unsigned int m_0   = start * strategy::out_height;
+        unsigned int m_max = std::min(end * strategy::out_height, _Msize);
+
+        /* Make sure we've been set up correctly. */
+        if(pretransposed)
+        {
+            assert(_B_transposed);
+        }
+        else
+        {
+            assert(_bm);
+        }
+
+        assert(_working_space);
+        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);
+
+        // Private buffers.  Treat working_space as an array of C buffers (one per thread) first, followed by the (window-divided) A buffer.
+        Toi *const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()) + (m_0 * _k_block * sizeof(Toi)));
+        Tri *const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));
+
+        // Shared buffers - these come either from BufferManager or _B_transposed.
+        const Toi *b_panel;
+
+        if(pretransposed)
+        {
+            b_panel = _B_transposed;
+        }
+
+        //printf("Starting GEMM loop, x_block=%d, k_block=%d\n", _x_block, _k_block);
+
+        // newkblock() is always true on the first iteration, so this will be set properly on the first loop.
+        int kern_k = 0;
+
+        for(; !current.done(); current.advance())
+        {
+            if(current.newkblock())
+            {
+                prof(PROFILE_PREPA, ((m_max - m_0) * (current.kmax() - current.k0()) * sizeof(Toi)), [&](void)
+                {
+                    if(_trA ^ strategy::A_transpose)
+                    {
+                        Transform<strategy::A_interleave, strategy::A_block, true>(a_panel, this->_Aptr, this->_lda, m_0, m_max, current.k0(), current.kmax());
+                    }
+                    else
+                    {
+                        Transform<strategy::A_interleave, strategy::A_block, false>(a_panel, this->_Aptr, this->_lda, m_0, m_max, current.k0(), current.kmax());
+                    }
+                });
+
+                // Figure out how many "K" the kernel will actually process.
+                kern_k = iceildiv(current.kmax() - current.k0(), strategy::k_unroll);
+                kern_k *= strat.k_unroll;
+            }
+
+            int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width);
+
+            if(!pretransposed)
+            {
+                /* Look ahead to the next block and populate it if necessary.
+                 * This avoids the populate operation becoming a bottleneck, and
+                 * helps keep the threads synchronized (the first thread to get
+                 * here will populate while the rest will advance).
+                 *
+                 * If we are running single threaded, bm->try_populate() will do
+                 * nothing.
+                 */
+                if(next.advance())
+                {
+                    _bm->try_populate(next.index(), [&](void *buffer)
+                    {
+                        prof(PROFILE_PREPB, (next.xmax() - next.x0()) * (next.kmax() - next.k0()) * sizeof(Toi), [&](void)
+                        {
+                            Toi *b_panel = reinterpret_cast<Toi *>(buffer);
+                            if(_trB ^ strategy::B_transpose)
+                            {
+                                Transform<strategy::B_interleave, strategy::B_block, true>(b_panel, this->_Bptr, this->_ldb, next.x0(), next.xmax(), next.k0(), next.kmax());
+                            }
+                            else
+                            {
+                                Transform<strategy::B_interleave, strategy::B_block, false>(b_panel, this->_Bptr, this->_ldb, next.x0(), next.xmax(), next.k0(), next.kmax());
+                            }
+                        });
+                    });
+                }
+
+                /* Get the buffer for this iteration from the BufferManager. */
+                b_panel = reinterpret_cast<Toi *>(_bm->get(current.index(), [&](void *bpv)
+                {
+                    prof(PROFILE_PREPB, (current.xmax() - current.x0()) * (current.kmax() - current.k0()) * sizeof(Toi), [&](void)
+                    {
+                        Toi *b_panel = reinterpret_cast<Toi *>(bpv);
+                        if(_trB ^ strategy::B_transpose)
+                        {
+                            Transform<strategy::B_interleave, strategy::B_block, true>(b_panel, this->_Bptr, this->_ldb, current.x0(), current.xmax(), current.k0(), current.kmax());
+                        }
+                        else
+                        {
+                            Transform<strategy::B_interleave, strategy::B_block, false>(b_panel, this->_Bptr, this->_ldb, current.x0(), current.xmax(), current.k0(), current.kmax());
+                        }
+                    });
+                }));
+            }
+
+            /* Do the actual work. */
+            for(unsigned int y = m_0; y < m_max; y += strategy::out_height)
+            {
+                unsigned int ymax = std::min(_Msize, y + strategy::out_height);
+
+                prof(PROFILE_KERNEL, (strategy::out_height * bblocks * strategy::out_width * kern_k), [&](void)
+                {
+                    strat.kernel(a_panel + ((y - m_0) * kern_k), b_panel, c_panel, 1, bblocks, kern_k);
+                });
+                prof(PROFILE_MERGE, (strategy::out_height * bblocks * strategy::out_width * sizeof(Tr)), [&](void)
+                {
+                    MergeResults<strategy::out_width, strategy::out_height>(this->_Cptr, c_panel, this->_ldc, y, ymax,
+                                                                            current.x0(), current.xmax(), _alpha, (current.k0() == 0 ? _beta : static_cast<Tr>(1)));
+                });
+            }
+
+            if(pretransposed)
+            {
+                b_panel += (bblocks * strat.out_width * kern_k);
+            }
+            else
+            {
+                _bm->release(current.index());
+            }
+        }
+    }
+
+public:
+    GemmInterleaved(GemmInterleaved &) = delete;
+    GemmInterleaved &operator=(GemmInterleaved &) = delete;
+
+    /* Constructor */
+    GemmInterleaved(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K,
+                    const bool trA, const bool trB, const Tr alpha, const Tr beta, const int maxthreads,
+                    const bool pretransposed)
+        : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _trA(trA), _trB(trB), _alpha(alpha), _beta(beta), _maxthreads(maxthreads), _pretransposed(pretransposed)
+    {
+        const unsigned int L1_size = ci->get_L1_cache_size();
+        const unsigned int L2_size = ci->get_L2_cache_size();
+
+        assert(maxthreads > 0);
+
+        // Work out blocking parameters
+
+        // k_block: Find out how much of the larger array can be loaded into half the cache.
+        // This should account for associative caches.
+        _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width, strategy::out_height)));
+
+        // Needs to be (at least a single) multiple of the K unroll level.
+        _k_block /= strategy::k_unroll;
+        _k_block = std::max(_k_block, 1U) * strategy::k_unroll;
+
+        // Now tune to presented problem size; this is how many blocks we need.
+        int num_k_blocks = iceildiv(K, _k_block);
+
+        // So divide the space equally into that many blocks.
+        _k_block = iceildiv(K, num_k_blocks);
+
+        // And round UP to the K unroll level required.
+        _k_block = iceildiv(_k_block, strategy::k_unroll);
+        _k_block *= strategy::k_unroll;
+
+        // x_block: Work out how many rows (of length k_block) will fit in the L2
+        // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
+        _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width + strategy::out_height))) / (sizeof(Toi) * _k_block);
+
+        // Needs to be (at least a single) multiple of the kernel output width.
+        _x_block /= strategy::out_width;
+        _x_block = std::max(_x_block, 1U) * strategy::out_width;
+
+        // And tune to the presented problem size.
+        int num_x_blocks = iceildiv(N, _x_block);
+        _x_block         = iceildiv(N, num_x_blocks);
+
+        _x_block = iceildiv(_x_block, strategy::out_width);
+        _x_block *= strategy::out_width;
+
+        // Work out the rounded size of M - needed for some buffers.
+        _Mround = iceildiv(M, strategy::out_height);
+        _Mround *= strategy::out_height;
+    }
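+
+    // Worked example of the blocking above (illustrative; assumes a 32kB L1,
+    // float operands and a strategy with out_width=12, out_height=8 and
+    // k_unroll=1, roughly the sgemm_12x8 case): the initial k_block is
+    // (32768/2) / (4 * 12) = 341; for K=1024 that needs iceildiv(1024, 341)
+    // = 4 blocks, so k_block is re-balanced to iceildiv(1024, 4) = 256.
+    // x_block is then sized from 90% of the L2 (minus the L1-resident
+    // panels) in the same way and rounded to a multiple of out_width.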
+
+    // Interface implementation - Compulsory functions
+
+    // Window size: Only the last thread should do a ragged block, so dole out work in units of out_height.
+    unsigned int get_window_size() const override
+    {
+        // _Mround is a multiple of out_height by definition.
+        return _Mround / strategy::out_height;
+    }
+
+    // set_nthreads: pass on to buffer manager to avoid it waiting for non-existent threads.
+    void set_nthreads(int nthreads) override
+    {
+        if(_bm)
+        {
+            _bm->set_nthreads(nthreads);
+        }
+    }
+
+    // Execute
+    void execute(unsigned int start, unsigned int end, int threadid) override
+    {
+        if(_pretransposed)
+        {
+            execute_internal<true>(start, end, threadid);
+        }
+        else
+        {
+            execute_internal<false>(start, end, threadid);
+        }
+    }
+
+    // Interface implementation - working space
+    size_t get_working_size() const override
+    {
+        // In all cases, we need one A buffer plus a C buffer per thread.
+        size_t size = get_a_working_size() + (get_c_working_size() * _maxthreads);
+
+        // For pretransposed case, there is no working space needed for B.
+        // Otherwise, we need a BufferManager.
+        if(!_pretransposed)
+        {
+            size += BufferManager::get_storage_requirement(_maxthreads, get_b_working_size());
+        }
+
+        size += 64; // Add on a cache line extra for alignment.
+
+        return size;
+    }
+
+    void set_working_space(void *working_space) override
+    {
+        // Make sure everything ends up cache line aligned
+        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
+        intptr_t working_space_int   = reinterpret_cast<intptr_t>(working_space);
+
+        size_t diff = 0;
+
+        if(working_space_int & 0x3F)
+        {
+            diff = 0x40 - (working_space_int & 0x3F);
+        }
+
+        working_space_bytes += diff;
+
+        if(_pretransposed)
+        {
+            // Pretransposed case: just set internal pointer to parameter value.
+            _working_space = reinterpret_cast<void *>(working_space_bytes);
+        }
+        else
+        {
+            // Otherwise, use the first part of the working space for the buffer manager.
+            // It's legal to call this again so don't leak a buffer manager if it already existed.
+            delete _bm;
+
+            _bm = new BufferManager(_maxthreads, get_b_working_size(), reinterpret_cast<void *>(working_space_bytes));
+
+            working_space_bytes += BufferManager::get_storage_requirement(_maxthreads, get_b_working_size());
+
+            _working_space = reinterpret_cast<void *>(working_space_bytes);
+        }
+    }
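+
+    // Resulting layout of the (cache-line-aligned) working space in the
+    // non-pretransposed case: the BufferManager's shared B panels (1 or 3 of
+    // them) come first, then one C panel per thread, then the window-divided
+    // A panel.  In the pretransposed case only the C and A regions are used.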
+
+    // Interface implementation - pretransposed
+    bool B_is_pretransposed() const override
+    {
+        return _pretransposed;
+    }
+
+    bool B_pretranspose_required() const override
+    {
+        return _pretransposed && (_B_transposed == nullptr);
+    }
+
+    // TODO: this could almost certainly be considerably simpler.
+    size_t get_B_pretransposed_array_size() const override
+    {
+        size_t      total = 0;
+        blockwalker current(_Ksize, _k_block, _Nsize, _x_block);
+
+        do
+        {
+            /* Figure out the size of each block. */
+            size_t x_size = (current.xmax() - current.x0());
+            size_t k_size = (current.kmax() - current.k0());
+
+            /* Round sizes up as needed. */
+            x_size = iceildiv(x_size, strategy::out_width);
+            x_size *= strategy::out_width;
+
+            k_size = iceildiv(k_size, strategy::k_unroll);
+            k_size *= strategy::k_unroll;
+
+            total += x_size * k_size * sizeof(Toi);
+        }
+        while(current.advance());
+
+        return total;
+    }
+
+    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb) override
+    {
+        blockwalker current(_Ksize, _k_block, _Nsize, _x_block);
+        Toi        *buffer = reinterpret_cast<Toi *>(in_buffer);
+        _B_transposed      = buffer;
+
+        do
+        {
+            /* Figure out the size of each block. */
+            size_t x_size = (current.xmax() - current.x0());
+            size_t k_size = (current.kmax() - current.k0());
+
+            /* Round sizes up as needed. */
+            x_size = iceildiv(x_size, strategy::out_width);
+            x_size *= strategy::out_width;
+
+            k_size = iceildiv(k_size, strategy::k_unroll);
+            k_size *= strategy::k_unroll;
+
+            if(_trB ^ strategy::B_transpose)
+            {
+                Transform<strategy::B_interleave, strategy::B_block, true>(buffer, B, ldb, current.x0(), current.xmax(), current.k0(), current.kmax());
+            }
+            else
+            {
+                Transform<strategy::B_interleave, strategy::B_block, false>(buffer, B, ldb, current.x0(), current.xmax(), current.k0(), current.kmax());
+            }
+
+            buffer += (x_size * k_size);
+        }
+        while(current.advance());
+    }
+
+    ~GemmInterleaved() override
+    {
+        delete _bm;
+    }
+};
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
new file mode 100644
index 0000000..b019279
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <stdio.h>
+
+#include "arm_gemm.hpp"
+
+#include "mergeresults.hpp"
+#include "profiler.hpp"
+#include "transform.hpp"
+
+namespace arm_gemm
+{
+// Implementation of the GemmCommon abstract class.
+//
+// This implementation is for native GEMM with no transposition.
+//
+// By default the source data is used in-place, but if type conversion is
+// needed we need to allocate working space (CURRENTLY NOT IMPLEMENTED).
+
+template <typename strategy, typename To, typename Tr>
+class GemmNative : public GemmCommon<To, Tr>
+{
+    typedef typename strategy::operand_type Toi;
+    typedef typename strategy::result_type  Tri;
+
+    const unsigned int _Msize;
+    const unsigned int _Nsize;
+    const unsigned int _Ksize;
+
+    Tr _beta;
+
+    const CPUInfo *const _ci;
+
+    unsigned int k_block = 0;
+    unsigned int n_block = 0;
+
+public:
+    GemmNative(GemmNative &) = delete;
+    GemmNative &operator=(GemmNative &) = delete;
+
+    GemmNative(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K, const Tr beta)
+        : _Msize(M), _Nsize(N), _Ksize(K), _beta(beta), _ci(ci)
+    {
+        /* For now don't do any blocking. TODO: figure out if we should. */
+        k_block = K;
+        n_block = N;
+    }
+
+    // Window is number of out_height blocks
+    unsigned int get_window_size() const override
+    {
+        return iceildiv(_Msize, strategy::out_height);
+    }
+
+    // Actually execute the GEMM.
+    void execute(unsigned int start, unsigned int end, int) override
+    {
+        profiler prof;
+        strategy strat(_ci);
+
+        unsigned int M_start = start * strategy::out_height;
+        unsigned int M_end   = std::min(end * strategy::out_height, _Msize);
+
+        static_assert(std::is_same<To, Toi>::value, "gemm_native: Operand types must be the same.");
+        static_assert(std::is_same<Tr, Tri>::value, "gemm_native: Result types must be the same.");
+
+        for(unsigned int y0 = M_start; y0 < M_end; y0 += strategy::out_height)
+        {
+            unsigned int ymax = std::min(y0 + strategy::out_height, M_end);
+
+            prof(PROFILE_KERNEL, (ymax - y0) * _Nsize * _Ksize, [&](void)
+            {
+                strat.kernel(this->_Aptr + (y0 * this->_lda), this->_lda, this->_Bptr, this->_ldb, this->_Cptr + (y0 * this->_ldc), this->_ldc, _beta, (ymax - y0), _Nsize, _Ksize);
+            });
+        }
+    }
+};
+
+} // namespace arm_gemm
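+
+// Windowing example (illustrative): with M=100 and a strategy whose
+// out_height is 4, get_window_size() returns iceildiv(100, 4) = 25, so a
+// thread given start=10, end=20 runs the kernel on rows 40..79, with the
+// final block of the whole window clamped to M if M is not a multiple of
+// out_height.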
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
new file mode 100644
index 0000000..3e790e1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "gemm_common.hpp"
+#include "gemm_interleaved.hpp"
+
+#include "kernels/a64_gemm_u16_12x8.hpp"
+
+namespace arm_gemm
+{
+template <>
+UniqueGemmCommon<uint16_t, uint32_t> gemm<uint16_t, uint32_t>(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
+                                                              const bool trA, const bool trB, uint32_t alpha, uint32_t beta,
+                                                              const int maxthreads, const bool pretransposed_hint)
+{
+    return UniqueGemmCommon<uint16_t, uint32_t>(new GemmInterleaved<gemm_u16_12x8, uint16_t, uint32_t>(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+}
+
+// Instantiate static class members
+const int gemm_u16_12x8::out_width;
+const int gemm_u16_12x8::out_height;
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
new file mode 100644
index 0000000..9ec479c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "gemm_common.hpp"
+#include "gemm_interleaved.hpp"
+
+#include "kernels/a64_gemm_u8_12x8.hpp"
+#include "kernels/a64_gemm_u8_4x4.hpp"
+
+namespace arm_gemm
+{
+template <>
+UniqueGemmCommon<uint8_t, uint32_t> gemm<uint8_t, uint32_t>(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
+                                                            const bool trA, const bool trB, const uint32_t alpha, const uint32_t beta,
+                                                            const int maxthreads, const bool pretransposed_hint)
+{
+    if(ci.has_dotprod())
+    {
+        // Dot product supporting CPUs.  This family has a special version for A55r1.
+        return UniqueGemmCommon<uint8_t, uint32_t>(new GemmInterleaved<gemm_u8_12x8, uint8_t, uint32_t>(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+    }
+
+    // Non dot-product code.
+    return UniqueGemmCommon<uint8_t, uint32_t>(new GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t>(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+
+    // TODO: There's a better approach for A53, but it doesn't work
+    // well on heterogeneous systems as the required data formats
+    // are different.  Figure out how to enable this:
+    // gemm = new GemmInterleaved<gemm_u16_12x8, uint8_t, uint32_t>(ci, M, N, K, trA, trB);
+}
+
+// Instantiate static class members
+const int gemm_u8_12x8::out_width;
+const int gemm_u8_12x8::out_height;
+
+const int gemm_u8_4x4::out_width;
+const int gemm_u8_4x4::out_height;
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
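
For orientation, a minimal sketch of how this factory is intended to be driven.  It relies only on what is visible in this patch: the templated gemm<To, Tr>() entry point and the GemmCommon window interface (get_window_size() / execute()); wiring up the operand pointers and working space through the other GemmCommon setters is deliberately elided, and run_u8_gemm itself is a hypothetical helper, not part of the library.

    #include "arm_gemm.hpp"

    using namespace arm_gemm;

    // Sketch only: run an M x N x K u8 -> u32 GEMM on a single thread.
    void run_u8_gemm(const CPUInfo &ci, unsigned int M, unsigned int N, unsigned int K)
    {
        // trA/trB = false, alpha = 1, beta = 0, one thread, no pretranspose hint.
        // On a dot-product capable core this picks gemm_u8_12x8, otherwise gemm_u8_4x4.
        UniqueGemmCommon<uint8_t, uint32_t> gemm_obj =
            gemm<uint8_t, uint32_t>(ci, M, N, K, false, false, 1, 0, 1, false);

        // ... set operand/result arrays and working space on gemm_obj here ...

        // The window is how the work gets split for multi-threading; with one
        // thread, simply run all of it.
        gemm_obj->execute(0, gemm_obj->get_window_size(), 0);
    }
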
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
new file mode 100644
index 0000000..c0b8862
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <stdio.h>
+
+#include "arm_gemm.hpp"
+
+#include "mergeresults.hpp"
+#include "profiler.hpp"
+#include "transform.hpp"
+
+namespace arm_gemm
+{
+// Implementation of the GemmCommon abstract class.
+//
+// This implementation is for a "native" (no-transform) GEMV with a
+// transposed matrix.
+//
+// As a native operation the source data is used in-place, so the internal
+// and external operand/result types must match.
+template <typename strategy, typename To, typename Tr>
+class GemvNativeTransposed : public GemmCommon<To, Tr>
+{
+    typedef typename strategy::operand_type Toi;
+    typedef typename strategy::result_type  Tri;
+
+    const unsigned int _Nsize;
+    const unsigned int _Ksize;
+
+    const Tr _alpha;
+
+    const CPUInfo *const _ci;
+
+    unsigned int m_block = 0;
+    unsigned int n_block = 0;
+
+public:
+    GemvNativeTransposed(GemvNativeTransposed &) = delete;
+    GemvNativeTransposed &operator=(GemvNativeTransposed &) = delete;
+
+    GemvNativeTransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K, const Tr alpha)
+        : _Nsize(N), _Ksize(K), _alpha(alpha), _ci(ci)
+    {
+        /* For now don't do any blocking. TODO: figure out if we should. */
+        m_block = K;
+        n_block = N;
+    }
+
+    // Window is number of out_width blocks.
+    unsigned int get_window_size() const override
+    {
+        return iceildiv(_Nsize, strategy::out_width);
+    }
+
+    // Actually execute the GEMV.
+    void execute(unsigned int start, unsigned int end, int) override
+    {
+        profiler prof;
+        strategy strat(_ci);
+
+        unsigned int N_start = start * strategy::out_width;
+        unsigned int N_end   = std::min(end * strategy::out_width, _Nsize);
+
+        static_assert(std::is_same<To, Toi>::value, "gemv_transposed: Operand types must be the same.");
+        static_assert(std::is_same<Tr, Tri>::value, "gemv_transposed: Result types must be the same.");
+
+        for(unsigned int m0 = 0; m0 < _Ksize; m0 += m_block)
+        {
+            unsigned int mmax = std::min(m0 + m_block, _Ksize);
+
+            for(unsigned int n0 = N_start; n0 < N_end; n0 += n_block)
+            {
+                unsigned int nmax = std::min(n0 + n_block, N_end);
+
+                prof(PROFILE_KERNEL, ((mmax - m0) * (nmax - n0)), [&](void)
+                {
+                    strat.kernel(this->_Bptr + (m0 * this->_ldb) + n0, this->_Aptr + m0, this->_Cptr + n0,
+                                 _alpha, this->_ldb, (mmax - m0), (nmax - n0));
+                });
+            }
+        }
+    }
+};
+
+} // namespace arm_gemm
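
To make the windowing scheme concrete: the window is counted in out_width-wide column blocks, and execute() maps a [start, end) sub-window back onto a column range, with std::min clamping the final block.  A sketch of how a caller might split that window across threads; run_parallel and the std::thread usage are illustrative only, the real scheduler lives elsewhere in the library.

    #include <thread>
    #include <vector>

    // Sketch: divide the window of a GemmCommon-derived object (such as
    // GemvNativeTransposed) across 'nthreads' workers.  Only get_window_size()
    // and execute() from this patch are used.
    template <typename Op>
    void run_parallel(Op &op, unsigned int nthreads)
    {
        const unsigned int window = op.get_window_size();
        std::vector<std::thread> workers;

        for(unsigned int t = 0; t < nthreads; t++)
        {
            // Each thread gets a contiguous slice of out_width-wide column blocks.
            unsigned int start = (window * t) / nthreads;
            unsigned int end   = (window * (t + 1)) / nthreads;

            if(start == end) continue;

            workers.emplace_back([&op, start, end, t]() {
                op.execute(start, end, t);
            });
        }

        for(auto &w : workers)
        {
            w.join();
        }
    }
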
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
new file mode 100644
index 0000000..0df331a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <stdio.h>
+
+#include "arm_gemm.hpp"
+
+#include "mergeresults.hpp"
+#include "profiler.hpp"
+#include "transform.hpp"
+
+namespace arm_gemm
+{
+// Implementation of the GemmCommon abstract class.
+//
+// This implementation is for GEMV with a transposed matrix.
+//
+// By default the source data is used in-place, but if type conversion is
+// needed then working space has to be allocated (CURRENTLY NOT IMPLEMENTED).
+
+template <typename strategy, typename To, typename Tr>
+class GemvPretransposed : public GemmCommon<To, Tr>
+{
+    typedef typename strategy::operand_type Toi;
+    typedef typename strategy::result_type  Tri;
+
+    const unsigned int _Nsize;
+    const unsigned int _Ksize;
+
+    const bool _trB;
+
+    const Tr _beta;
+
+    const CPUInfo *const _ci;
+
+    unsigned int m_block = 0;
+    unsigned int n_block = 0;
+
+    const Toi *_A_pretransposed = nullptr;
+
+public:
+    GemvPretransposed(GemvPretransposed &) = delete;
+    GemvPretransposed &operator=(GemvPretransposed &) = delete;
+
+    GemvPretransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K, const bool trB, const Tr beta)
+        : _Nsize(N), _Ksize(K), _trB(trB), _beta(beta), _ci(ci)
+    {
+        /* For now don't do any blocking. TODO: figure out if we should. */
+        m_block = K;
+        n_block = N;
+    }
+
+    // Window is number of out_width blocks.
+    unsigned int get_window_size() const override
+    {
+        return iceildiv(_Nsize, strategy::out_width);
+    }
+
+    // Actually execute the GEMV.
+    void execute(unsigned int start, unsigned int end, int) override
+    {
+        profiler prof;
+        strategy strat(_ci);
+
+        unsigned int N_start = start * strategy::out_width;
+        unsigned int N_end   = std::min(end * strategy::out_width, _Nsize);
+
+        static_assert(std::is_same<Tr, Tri>::value, "GemvPretransposed: Result types must be the same.");
+
+        for(unsigned int m0 = 0; m0 < _Ksize; m0 += m_block)
+        {
+            unsigned int mmax = std::min(m0 + m_block, _Ksize);
+
+            for(unsigned int n0 = N_start; n0 < N_end; n0 += n_block)
+            {
+                unsigned int nmax = std::min(n0 + n_block, N_end);
+
+                prof(PROFILE_KERNEL, ((mmax - m0) * (nmax - n0)), [&](void)
+                {
+                    /* This assumes that the underlying call was a GEMM with M=1; for the N=1 case we would have to pick up this->_Bptr below instead */
+                    strat.kernel(_A_pretransposed + (n0 * _Ksize) + (m0 * strategy::A_interleave), (_Ksize * strategy::A_interleave), this->_Aptr + m0, this->_Cptr + n0, _beta, (mmax - m0), (nmax - n0));
+                });
+            }
+        }
+    }
+
+    /* Pretransposed interface implementation */
+    bool B_is_pretransposed() const override
+    {
+        return true;
+    }
+
+    bool B_pretranspose_required() const override
+    {
+        /* Transpose is required if _A_pretransposed is still nullptr */
+        return (_A_pretransposed == nullptr);
+    }
+
+    size_t get_B_pretransposed_array_size() const override
+    {
+        return _Ksize * iceildiv(_Nsize, strategy::A_interleave) * strategy::A_interleave * sizeof(float);
+    }
+
+    void pretranspose_B_array(void *buffer, const To *B, const int ldb) override
+    {
+        Toi *A_buffer = reinterpret_cast<Toi *>(buffer);
+
+        /* The sense is reversed here because we are dealing with B rather
+         * than A: if strategy::A_transpose is false and _trB is false, we
+         * still transpose.  */
+        if(_trB ^ strategy::A_transpose)
+        {
+            Transform<strategy::A_interleave, strategy::A_block, false>(A_buffer, B, ldb, 0, _Nsize, 0, _Ksize);
+        }
+        else
+        {
+            Transform<strategy::A_interleave, strategy::A_block, true>(A_buffer, B, ldb, 0, _Nsize, 0, _Ksize);
+        }
+
+        _A_pretransposed = A_buffer;
+    }
+};
+
+} // namespace arm_gemm
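
The pretransposed interface implies a fixed call sequence on the caller's side: ask whether a pretransposed buffer is still needed, allocate get_B_pretransposed_array_size() bytes, pass the buffer to pretranspose_B_array(), then execute as usual.  A hedged sketch of that sequence; run_with_pretransposed_B is a hypothetical helper and the std::vector allocation is just one possible ownership choice.

    #include <cstdint>
    #include <vector>

    // Sketch: prepare and run a GemmCommon-derived object that pretransposes B,
    // such as GemvPretransposed above.  Only the interface shown in this patch
    // (B_is_pretransposed, B_pretranspose_required, get_B_pretransposed_array_size,
    // pretranspose_B_array, get_window_size, execute) is used.
    template <typename Op, typename To>
    void run_with_pretransposed_B(Op &op, const To *B, int ldb)
    {
        std::vector<uint8_t> b_buffer;

        if(op.B_is_pretransposed() && op.B_pretranspose_required())
        {
            b_buffer.resize(op.get_B_pretransposed_array_size());
            op.pretranspose_B_array(b_buffer.data(), B, ldb);
        }

        // ... set the A/C arrays here as usual ...
        op.execute(0, op.get_window_size(), 0);
    }
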
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp
new file mode 100644
index 0000000..de11dc5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __arm__
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a32_sgemm_8x6(const float *, const float *, float *, int, int, int);
+void a32_sgemm_8x6_a53(const float *, const float *, float *, int, int, int);
+void a32_sgemm_8x6_a55r1(const float *, const float *, float *, int, int, int);
+
+// 8x6 SGEMM "strategy" class.
+//
+// This describes the characteristics of a family of kernels, in terms of
+// the required interleave properties and the output block size.
+//
+// All kernels in the family must share these characteristics.  The actual
+// kernel to be used can be chosen at runtime, based on the CPUInfo
+// structure.
+class sgemm_8x6
+{
+public:
+    typedef float operand_type;
+    typedef float result_type;
+
+    typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
+
+    /* Describes the data layout for A input */
+    static const int A_interleave = 6;
+    static const int A_block      = 1;
+    static const int A_transpose  = 0;
+
+    /* Same for B input */
+    static const int B_interleave = 8;
+    static const int B_block      = 1;
+    static const int B_transpose  = 1;
+
+    /* Kernel blocking parameters */
+    static const int out_width  = 8;
+    static const int out_height = 6;
+    static const int k_unroll   = 1;
+
+    kern_type kernel = a32_sgemm_8x6;
+
+    sgemm_8x6(const CPUInfo *ci)
+    {
+        switch(ci->get_cpu_model())
+        {
+            case CPUModel::A53:
+                kernel = a32_sgemm_8x6_a53;
+                break;
+
+            case CPUModel::A55r1:
+                kernel = a32_sgemm_8x6_a55r1;
+                break;
+
+            default:
+                kernel = a32_sgemm_8x6;
+                break;
+        }
+    }
+};
+
+} // namespace arm_gemm
+#endif // __arm__
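
The strategy class is what GemmInterleaved is templated on: the static constants tell the packing code how to interleave A and B, while the constructor resolves the kernel pointer for the detected core.  A small sketch of how the strategy gets consumed once the panels are packed; run_one_block is illustrative, it assumes the header above and CPUInfo are in scope, and the panel pointers are assumed to already be in the 6-/8-interleaved layout described by the strategy.

    // Sketch: how a container such as GemmInterleaved<sgemm_8x6, float, float>
    // uses the strategy.  The kernel choice (generic / A53 / A55r1) has already
    // been made by the strategy constructor from the CPUInfo.
    void run_one_block(const CPUInfo *ci,
                       const float *a_panel, const float *b_panel, float *c_panel,
                       int ablocks, int bblocks, int K)
    {
        sgemm_8x6 strat(ci);

        // out_width/out_height describe the C block shape; with k_unroll == 1
        // no extra padding of K is needed here.
        strat.kernel(a_panel, b_panel, c_panel, ablocks, bblocks, K);
    }
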
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp
new file mode 100644
index 0000000..428498f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp
@@ -0,0 +1,400 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __arm__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 8x6), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
+
+namespace arm_gemm
+{
+void a32_sgemm_8x6_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K)
+{
+    const float *a_ptr = Apanel;
+    float       *c_ptr = Cpanel;
+
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const float *a_ptr0 = a_ptr;
+        const float *b_ptr  = Bpanel;
+
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            a_ptr     = a_ptr0;
+            int tails = (K & 3);
+            if(tails == 0)
+            {
+                tails = 4;
+            }
+            int k = ((K + 3) / 4) - 1;
+
+            __asm __volatile(
+                "vmov.i32    q4, #0\n"
+                "vld1.32    {d0-d1}, [%[a_ptr] :64]\n"
+                "vmov.i32    q5, #0\n"
+                "vld1.32    {d4-d5}, [%[b_ptr] :128]\n"
+                "vmov.i32    q6, #0\n"
+                "ldr        r0, [%[a_ptr], #0x10]\n"
+                "vmov.i32    q7, #0\n"
+                "ldr        r1, [%[a_ptr], #0x14]\n"
+                "vmov.i32    q8, #0\n" ASM_PREFETCH("[%[a_ptr], #0x40]") "vmov.i32    q9, #0\n" ASM_PREFETCH("[%[b_ptr], #0x40]") "vmov.i32    q10, #0\n" ASM_PREFETCH("[%[a_ptr], #0x80]") "vmov.i32    q11, #0\n"
+                ASM_PREFETCH("[%[b_ptr], #0x80]")
+                "vmov.i32    q12, #0\n"
+                "vmov.i32    q13, #0\n" ASM_PREFETCH("[%[a_ptr], #0xC0]") "vmov.i32    q14, #0\n" ASM_PREFETCH("[%[b_ptr], #0XC0]")
+                "vmov.i32    q15, #0\n"
+                "cmp        %[k], #0\n"
+                "beq        6f\n"
+
+                "1:\n"
+                // Unroll 0
+                "vldr        d6, [%[b_ptr], #0x10]\n"
+                "vmov        d2, r0, r1\n"
+                "vmla.f32    q4, q2, d0[0]\n"
+                "ldr        r0, [%[b_ptr], #0x18]\n"
+                "vmla.f32    q5, q2, d0[1]\n"
+                "ldr        r1, [%[b_ptr], #0x1C]\n"
+                "vmla.f32    q6, q2, d1[0]\n"
+
+                "vldr        d3, [%[a_ptr], #0x18]\n"
+                "vmov        d7, r0, r1\n"
+                "vmla.f32    q7, q2, d1[1]\n" ASM_PREFETCH("[%[a_ptr], #0x100]")
+                "vmla.f32    q8, q2, d2[0]\n"
+                "vmla.f32    q9, q2, d2[1]\n"
+
+                "vldr        d4, [%[b_ptr], #0x20]\n"
+                "vmla.f32    q10, q3, d0[0]\n"
+                "ldr        r0, [%[b_ptr], #0x28]\n"
+                "vmla.f32    q11, q3, d0[1]\n"
+                "ldr        r1, [%[b_ptr], #0x2C]\n"
+                "vmla.f32    q12, q3, d1[0]\n"
+
+                "vldr        d0, [%[a_ptr], #0x20]\n"
+                "vmov        d5, r0, r1\n"
+                "vmla.f32    q13, q3, d1[1]\n"
+                "ldr        r0, [%[a_ptr], #0x28]\n"
+                "vmla.f32    q14, q3, d2[0]\n"
+                "ldr        r1, [%[a_ptr], #0x2C]\n"
+                "vmla.f32    q15, q3, d2[1]\n"
+
+                // Unroll 1
+                "vldr        d6, [%[b_ptr], #0x30]\n"
+                "vmov        d1, r0, r1\n"
+                "vmla.f32    q4, q2, d3[0]\n"
+                "ldr        r0, [%[b_ptr], #0x38]\n"
+                "vmla.f32    q5, q2, d3[1]\n"
+                "ldr        r1, [%[b_ptr], #0x3C]\n"
+                "vmla.f32    q6, q2, d0[0]\n"
+
+                "vldr        d2, [%[a_ptr], #0x30]\n"
+                "vmov        d7, r0, r1\n"
+                "vmla.f32    q7, q2, d0[1]\n" ASM_PREFETCH("[%[b_ptr], #0x100]")
+                "vmla.f32    q8, q2, d1[0]\n"
+                "vmla.f32    q9, q2, d1[1]\n"
+
+                "vldr        d4, [%[b_ptr], #0x40]\n"
+                "vmla.f32    q10, q3, d3[0]\n"
+                "ldr        r0, [%[b_ptr], #0x48]\n"
+                "vmla.f32    q11, q3, d3[1]\n"
+                "ldr        r1, [%[b_ptr], #0x4C]\n"
+                "vmla.f32    q12, q3, d0[0]\n"
+
+                "vldr        d3, [%[a_ptr], #0x38]\n"
+                "vmov        d5, r0, r1\n"
+                "vmla.f32    q13, q3, d0[1]\n"
+                "ldr        r0, [%[a_ptr], #0x40]\n"
+                "vmla.f32    q14, q3, d1[0]\n"
+                "ldr        r1, [%[a_ptr], #0x44]\n"
+                "vmla.f32    q15, q3, d1[1]\n"
+
+                // Unroll 2
+                "vldr        d6, [%[b_ptr], #0x50]\n"
+                "vmov        d0, r0, r1\n"
+                "vmla.f32    q4, q2, d2[0]\n"
+                "ldr        r0, [%[b_ptr], #0x58]\n"
+                "vmla.f32    q5, q2, d2[1]\n"
+                "ldr        r1, [%[b_ptr], #0x5C]\n"
+                "vmla.f32    q6, q2, d3[0]\n"
+
+                "vldr        d1, [%[a_ptr], #0x48]\n"
+                "vmov        d7, r0, r1\n"
+                "vmla.f32    q7, q2, d3[1]\n" ASM_PREFETCH("[%[a_ptr], #0x140]")
+                "vmla.f32    q8, q2, d0[0]\n"
+                "vmla.f32    q9, q2, d0[1]\n"
+
+                "vldr        d4, [%[b_ptr], #0x60]\n"
+                "vmla.f32    q10, q3, d2[0]\n"
+                "ldr        r0, [%[b_ptr], #0x68]\n"
+                "vmla.f32    q11, q3, d2[1]\n"
+                "ldr        r1, [%[b_ptr], #0x6C]\n"
+                "vmla.f32    q12, q3, d3[0]\n"
+
+                "vldr        d2, [%[a_ptr], #0x50]\n"
+                "vmov        d5, r0, r1\n"
+                "vmla.f32    q13, q3, d3[1]\n"
+                "ldr        r0, [%[a_ptr], #0x58]\n"
+                "vmla.f32    q14, q3, d0[0]\n"
+                "ldr        r1, [%[a_ptr], #0x5C]\n"
+                "vmla.f32    q15, q3, d0[1]\n"
+                "add        %[a_ptr], %[a_ptr], #0x60\n"
+
+                // Unroll 3
+                "vldr        d6, [%[b_ptr], #0x70]\n"
+                "vmov        d3, r0, r1\n"
+                "vmla.f32    q4, q2, d1[0]\n"
+                "ldr        r0, [%[b_ptr], #0x78]\n"
+                "vmla.f32    q5, q2, d1[1]\n"
+                "ldr        r1, [%[b_ptr], #0x7C]\n"
+                "vmla.f32    q6, q2, d2[0]\n"
+                "add        %[b_ptr], %[b_ptr], #0x80\n"
+
+                "vldr        d0, [%[a_ptr], #0x00]\n"
+                "vmov        d7, r0, r1\n"
+                "vmla.f32    q7, q2, d2[1]\n" ASM_PREFETCH("[%[b_ptr], #0xC0]")
+                "vmla.f32    q8, q2, d3[0]\n"
+                "vmla.f32    q9, q2, d3[1]\n"
+
+                "vldr        d4, [%[b_ptr], #0x00]\n"
+                "vmla.f32    q10, q3, d1[0]\n"
+                "ldr        r0, [%[b_ptr], #0x08]\n"
+                "vmla.f32    q11, q3, d1[1]\n"
+                "ldr        r1, [%[b_ptr], #0x0C]\n"
+                "vmla.f32    q12, q3, d2[0]\n"
+                "subs        %[k], %[k], #1\n"
+
+                "vldr        d1, [%[a_ptr], #0x08]\n"
+                "vmov        d5, r0, r1\n"
+                "vmla.f32    q13, q3, d2[1]\n"
+                "ldr        r0, [%[a_ptr], #0x10]\n"
+                "vmla.f32    q14, q3, d3[0]\n"
+                "ldr        r1, [%[a_ptr], #0x14]\n"
+                "vmla.f32    q15, q3, d3[1]\n"
+                "bne        1b\n"
+
+                // "Tails" shows how many multiply blocks are needed at the
+                // end, must be 1-4 inclusive.  Bail out to alternative tail
+                // immediately if it's 1.
+                "6:\n"
+                "subs        %[tails], %[tails], #1\n"
+                "beq        3f\n"
+
+                // Detached final iteration - for now adapt the generic
+                // tails rather than reimplementing for A53.
+
+                // Unroll 0
+                "vmov        d2, r0, r1\n"
+                "add        %[a_ptr], %[a_ptr], #0x18\n"
+                "vmla.f32    q4, q2, d0[0]\n"
+                "vld1.32    {d3}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q5, q2, d0[1]\n"
+                "add        %[b_ptr], %[b_ptr], #0x10\n"
+                "vmla.f32    q6, q2, d1[0]\n"
+                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+                "vmla.f32    q7, q2, d1[1]\n"
+                "vmla.f32    q8, q2, d2[0]\n"
+                "subs        %[tails], %[tails], #1\n"
+                "vmla.f32    q9, q2, d2[1]\n"
+                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+
+                "vmla.f32    q10, q3, d0[0]\n"
+                "vmla.f32    q11, q3, d0[1]\n"
+                "vmla.f32    q12, q3, d1[0]\n"
+                "vmla.f32    q13, q3, d1[1]\n"
+                "vld1.32    {d0-d1}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q14, q3, d2[0]\n"
+                "vmla.f32    q15, q3, d2[1]\n"
+                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+                "beq        4f\n"
+
+                // Unroll 1
+                "vmla.f32    q4, q2, d3[0]\n"
+                "vmla.f32    q5, q2, d3[1]\n"
+                "subs        %[tails], %[tails], #1\n"
+                "vmla.f32    q6, q2, d0[0]\n"
+                "vmla.f32    q7, q2, d0[1]\n"
+                "vmla.f32    q8, q2, d1[0]\n"
+                "vmla.f32    q9, q2, d1[1]\n"
+                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+
+                "vmla.f32    q10, q3, d3[0]\n"
+                "vmla.f32    q11, q3, d3[1]\n"
+                "vld1.32    {d2-d3}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q12, q3, d0[0]\n"
+                "vmla.f32    q13, q3, d0[1]\n"
+                "vmla.f32    q14, q3, d1[0]\n"
+                "vmla.f32    q15, q3, d1[1]\n"
+                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+                "beq        5f\n"
+
+                // Unroll 2
+                "vld1.32    {d0-d1}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q4, q2, d2[0]\n"
+                "vmla.f32    q5, q2, d2[1]\n"
+                "vmla.f32    q6, q2, d3[0]\n"
+                "vmla.f32    q7, q2, d3[1]\n"
+                "vmla.f32    q8, q2, d0[0]\n"
+                "vmla.f32    q9, q2, d0[1]\n"
+                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+
+                "vmla.f32    q10, q3, d2[0]\n"
+                "vmla.f32    q11, q3, d2[1]\n"
+                "vmla.f32    q12, q3, d3[0]\n"
+                "vmla.f32    q13, q3, d3[1]\n"
+                "vld1.32    {d2-d3}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q14, q3, d0[0]\n"
+                "vmla.f32    q15, q3, d0[1]\n"
+                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+
+                // Unroll 3
+                "vmla.f32    q4, q2, d1[0]\n"
+                "vmla.f32    q10, q3, d1[0]\n"
+                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q5, q2, d1[1]\n"
+                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q11, q3, d1[1]\n"
+                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q6, q2, d2[0]\n"
+                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q12, q3, d2[0]\n"
+                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q7, q2, d2[1]\n"
+                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q13, q3, d2[1]\n"
+                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q8, q2, d3[0]\n"
+                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q14, q3, d3[0]\n"
+                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q9, q2, d3[1]\n"
+                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q15, q3, d3[1]\n"
+                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
+                "b        2f\n"
+
+                // tails==1 final tail
+                "3:\n"
+                "vmov        d2, r0, r1\n"
+                "add        %[b_ptr], %[b_ptr], #0x10\n"
+                "vmla.f32    q4, q2, d0[0]\n"
+                "add        %[a_ptr], %[a_ptr], #0x18\n"
+                "vmla.f32    q5, q2, d0[1]\n"
+                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+                "vmla.f32    q6, q2, d1[0]\n"
+                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q10, q3, d0[0]\n"
+                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q11, q3, d0[1]\n"
+                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q12, q3, d1[0]\n"
+                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q7, q2, d1[1]\n"
+                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q13, q3, d1[1]\n"
+                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q8, q2, d2[0]\n"
+                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q14, q3, d2[0]\n"
+                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q9, q2, d2[1]\n"
+                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q15, q3, d2[1]\n"
+                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
+                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
+                "b        2f\n"
+
+                // tails==2 final tail
+                "4:\n"
+                "vmla.f32    q4, q2, d3[0]\n"
+                "vmla.f32    q10, q3, d3[0]\n"
+                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q5, q2, d3[1]\n"
+                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q11, q3, d3[1]\n"
+                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q6, q2, d0[0]\n"
+                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q12, q3, d0[0]\n"
+                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q7, q2, d0[1]\n"
+                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q13, q3, d0[1]\n"
+                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q8, q2, d1[0]\n"
+                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q14, q3, d1[0]\n"
+                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q9, q2, d1[1]\n"
+                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q15, q3, d1[1]\n"
+                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
+                "b        2f\n"
+
+                // tails==3 final tail
+                "5:\n"
+                "vmla.f32    q4, q2, d2[0]\n"
+                "vld1.32    {d0}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q5, q2, d2[1]\n"
+                "vmla.f32    q6, q2, d3[0]\n"
+                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q10, q3, d2[0]\n"
+                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q11, q3, d2[1]\n"
+                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q12, q3, d3[0]\n"
+                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q7, q2, d3[1]\n"
+                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q13, q3, d3[1]\n"
+                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q8, q2, d0[0]\n"
+                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q14, q3, d0[0]\n"
+                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q9, q2, d0[1]\n"
+                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q15, q3, d0[1]\n"
+                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
+                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
+
+                "2:\n"
+                "vst1.32    {d30-d31}, [%[c_ptr] :128]!\n"
+                : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), [k] "+r"(k), [tails] "+r"(tails)
+                :
+                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0", "r1");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif
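
For readers who do not want to trace the NEON, a plain scalar statement of what every a32_sgemm_8x6 variant computes, assuming the packing implied by the strategy header (A interleaved 6-wide, B interleaved 8-wide, A_block/B_block of 1) and the store order used above (each C block written as 6 rows of 8 floats).  This is a readability aid, not a drop-in replacement.

    // Scalar reference for the 8x6 SGEMM kernels, under the assumed packing:
    // Apanel holds 6 interleaved A values per k step, Bpanel holds 8
    // interleaved B values per k step.
    void a32_sgemm_8x6_ref(const float *Apanel, const float *Bpanel, float *Cpanel,
                           int ablocks, int bblocks, int K)
    {
        const float *a_ptr = Apanel;
        float       *c_ptr = Cpanel;

        for(int yb = 0; yb < ablocks; yb++)
        {
            const float *b_ptr = Bpanel;

            for(int xb = 0; xb < bblocks; xb++)
            {
                float acc[6][8] = {};

                // Accumulate the 6x8 outer products over all k steps.
                for(int k = 0; k < K; k++)
                {
                    for(int r = 0; r < 6; r++)
                    {
                        for(int c = 0; c < 8; c++)
                        {
                            acc[r][c] += a_ptr[(k * 6) + r] * b_ptr[(k * 8) + c];
                        }
                    }
                }

                // Write the block out as 6 rows of 8 floats.
                for(int r = 0; r < 6; r++)
                {
                    for(int c = 0; c < 8; c++)
                    {
                        *c_ptr++ = acc[r][c];
                    }
                }

                b_ptr += K * 8; // next B panel, same A panel
            }

            a_ptr += K * 6; // next A panel
        }
    }
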
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp
new file mode 100644
index 0000000..4cfb72a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __arm__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 8x6), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
+
+namespace arm_gemm
+{
+void a32_sgemm_8x6_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K)
+{
+    const float *a_ptr = Apanel;
+    float       *c_ptr = Cpanel;
+
+    /* Work out starting values for "k" and "tails" in the inner loop. */
+    int tails_initial = (K & 3);
+    if(tails_initial == 0)
+    {
+        tails_initial = 4;
+    }
+
+    int k_initial = ((K + 3) / 4) - 1;
+
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const float *a_ptr0 = a_ptr;
+        const float *b_ptr  = Bpanel;
+
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            int tails = tails_initial;
+            int k     = k_initial;
+
+            a_ptr = a_ptr0;
+
+            __asm __volatile(
+                "vldr        d0, [%[a_ptr]]\n"
+                "vmov.i32    q4, #0\n"
+                "vldr        d1, [%[a_ptr], #0x08]\n"
+                "vmov.i32    q5, #0\n"
+                "vldr        d4, [%[b_ptr]]\n"
+                "vmov.i32    q6, #0\n"
+                "vldr        d5, [%[b_ptr], #0x08]\n"
+                "vmov.i32    q7, #0\n"
+                "vldr        d2, [%[a_ptr], #0x10]\n"
+                "vmov.i32    q8, #0\n" ASM_PREFETCH("[%[b_ptr], #0x40]") "vmov.i32    q9, #0\n" ASM_PREFETCH("[%[a_ptr], #0x40]") "vmov.i32    q10, #0\n" ASM_PREFETCH("[%[b_ptr], #0x80]") "vmov.i32    q11, #0\n"
+                ASM_PREFETCH("[%[a_ptr], #0x80]") "vmov.i32    q12, #0\n" ASM_PREFETCH("[%[b_ptr], #0XC0]") "vmov.i32    q13, #0\n" ASM_PREFETCH("[%[a_ptr], #0xC0]") "vmov.i32    q14, #0\n"
+                ASM_PREFETCH("[%[b_ptr], #0x100]") "vmov.i32    q15, #0\n" ASM_PREFETCH("[%[a_ptr], #0x100]") "cmp        %[k], #0\n" ASM_PREFETCH("[%[b_ptr], #0x140]") "beq        6f\n"
+                ASM_PREFETCH("[%[b_ptr], #0x180]")
+
+                "1:\n"
+                // Unroll 0
+                "vmla.f32    q4, q2, d0[0]\n"
+                "vldr        d6, [%[b_ptr], #0x10]\n"
+                "vmla.f32    q5, q2, d0[1]\n"
+                "vldr        d7, [%[b_ptr], #0x18]\n"
+                "vmla.f32    q6, q2, d1[0]\n"
+                "vldr        d3, [%[a_ptr], #0x18]\n"
+                "vmla.f32    q7, q2, d1[1]\n" ASM_PREFETCH("[%[a_ptr], #0x140]")
+                "vmla.f32    q8, q2, d2[0]\n"
+                "subs        %[k], %[k], #1\n"
+                "vmla.f32    q9, q2, d2[1]\n"
+                "vldr        d4, [%[b_ptr], #0x20]\n"
+                "vmla.f32    q10, q3, d0[0]\n"
+                "vldr        d5, [%[b_ptr], #0x28]\n"
+                "vmla.f32    q11, q3, d0[1]\n"
+                "vldr        d0, [%[a_ptr], #0x20]\n"
+                "vmla.f32    q12, q3, d1[0]\n"
+
+                "vmla.f32    q13, q3, d1[1]\n"
+                "vldr        d1, [%[a_ptr], #0x28]\n"
+                "vmla.f32    q14, q3, d2[0]\n"
+
+                "vmla.f32    q15, q3, d2[1]\n"
+                "vldr        d6, [%[b_ptr], #0x30]\n"
+
+                // Unroll 1
+                "vmla.f32    q4, q2, d3[0]\n"
+                "vldr        d7, [%[b_ptr], #0x38]\n"
+                "vmla.f32    q5, q2, d3[1]\n"
+                "vldr        d2, [%[a_ptr], #0x30]\n"
+                "vmla.f32    q6, q2, d0[0]\n"
+
+                "vmla.f32    q7, q2, d0[1]\n" ASM_PREFETCH("[%[b_ptr], #0x1C0]")
+                "vmla.f32    q8, q2, d1[0]\n"
+
+                "vmla.f32    q9, q2, d1[1]\n"
+                "vldr        d4, [%[b_ptr], #0x40]\n"
+                "vmla.f32    q10, q3, d3[0]\n"
+                "vldr        d5, [%[b_ptr], #0x48]\n"
+                "vmla.f32    q11, q3, d3[1]\n"
+                "vldr        d3, [%[a_ptr], #0x38]\n"
+                "vmla.f32    q12, q3, d0[0]\n"
+
+                "vmla.f32    q13, q3, d0[1]\n"
+                "vldr        d0, [%[a_ptr], #0x40]\n"
+                "vmla.f32    q14, q3, d1[0]\n"
+
+                "vmla.f32    q15, q3, d1[1]\n"
+                "vldr        d6, [%[b_ptr], #0x50]\n"
+
+                // Unroll 2
+                "vmla.f32    q4, q2, d2[0]\n"
+                "vldr        d7, [%[b_ptr], #0x58]\n"
+                "vmla.f32    q5, q2, d2[1]\n"
+                "vldr        d1, [%[a_ptr], #0x48]\n"
+                "vmla.f32    q6, q2, d3[0]\n"
+
+                "vmla.f32    q7, q2, d3[1]\n" ASM_PREFETCH("[%[a_ptr], #0x180]")
+                "vmla.f32    q8, q2, d0[0]\n"
+
+                "vmla.f32    q9, q2, d0[1]\n"
+                "vldr        d4, [%[b_ptr], #0x60]\n"
+                "vmla.f32    q10, q3, d2[0]\n"
+                "vldr        d5, [%[b_ptr], #0x68]\n"
+                "vmla.f32    q11, q3, d2[1]\n"
+                "vldr        d2, [%[a_ptr], #0x50]\n"
+                "vmla.f32    q12, q3, d3[0]\n"
+
+                "vmla.f32    q13, q3, d3[1]\n"
+                "vldr        d3, [%[a_ptr], #0x58]\n"
+                "vmla.f32    q14, q3, d0[0]\n"
+                "add        %[a_ptr], %[a_ptr], #0x60\n"
+                "vmla.f32    q15, q3, d0[1]\n"
+                "vldr        d6, [%[b_ptr], #0x70]\n"
+
+                // Unroll 3
+                "vmla.f32    q4, q2, d1[0]\n"
+                "vldr        d7, [%[b_ptr], #0x78]\n"
+                "vmla.f32    q5, q2, d1[1]\n"
+                "add        %[b_ptr], %[b_ptr], #0x80\n"
+                "vmla.f32    q6, q2, d2[0]\n"
+                "vldr        d0, [%[a_ptr], #0x00]\n"
+                "vmla.f32    q7, q2, d2[1]\n" ASM_PREFETCH("[%[b_ptr], #0x180]")
+                "vmla.f32    q8, q2, d3[0]\n"
+
+                "vmla.f32    q9, q2, d3[1]\n"
+                "vldr        d4, [%[b_ptr], #0x00]\n"
+                "vmla.f32    q10, q3, d1[0]\n"
+                "vldr        d5, [%[b_ptr], #0x08]\n"
+                "vmla.f32    q11, q3, d1[1]\n"
+                "vldr        d1, [%[a_ptr], #0x08]\n"
+                "vmla.f32    q12, q3, d2[0]\n"
+
+                "vmla.f32    q13, q3, d2[1]\n"
+                "vldr        d2, [%[a_ptr], #0x10]\n"
+                "vmla.f32    q14, q3, d3[0]\n"
+
+                "vmla.f32    q15, q3, d3[1]\n"
+                "bne        1b\n"
+
+                // "Tails" shows how many multiply blocks are needed at the
+                // end, must be 1-4 inclusive.  Bail out to alternative tail
+                // immediately if it's 1.
+                "6:\n"
+                "subs        %[tails], %[tails], #1\n"
+                "beq        3f\n"
+
+                // Detached final iteration
+
+                // Unroll 0
+                "vmla.f32    q4, q2, d0[0]\n"
+                "vldr        d6, [%[b_ptr], #0x10]\n"
+                "vmla.f32    q5, q2, d0[1]\n"
+                "vldr        d7, [%[b_ptr], #0x18]\n"
+                "vmla.f32    q6, q2, d1[0]\n"
+                "vldr        d3, [%[a_ptr], #0x18]\n"
+                "vmla.f32    q7, q2, d1[1]\n"
+                "subs        %[tails], %[tails], #1\n"
+                "vmla.f32    q8, q2, d2[0]\n"
+                "vmla.f32    q9, q2, d2[1]\n"
+                "vldr        d4, [%[b_ptr], #0x20]\n"
+
+                "vmla.f32    q10, q3, d0[0]\n"
+                "vldr        d5, [%[b_ptr], #0x28]\n"
+                "vmla.f32    q11, q3, d0[1]\n"
+                "vldr        d0, [%[a_ptr], #0x20]\n"
+                "vmla.f32    q12, q3, d1[0]\n"
+                "add        %[b_ptr], %[b_ptr], #0x30\n"
+                "vmla.f32    q13, q3, d1[1]\n"
+                "vldr        d1, [%[a_ptr], #0x28]\n"
+                "vmla.f32    q14, q3, d2[0]\n"
+                "vmla.f32    q15, q3, d2[1]\n"
+                "beq        4f\n"
+
+                // Unroll 1
+                "vmla.f32    q4, q2, d3[0]\n"
+                "vldr        d6, [%[b_ptr], #0x30]\n"
+                "vmla.f32    q5, q2, d3[1]\n"
+                "vldr        d7, [%[b_ptr], #0x38]\n"
+                "vmla.f32    q6, q2, d0[0]\n"
+                "vldr        d2, [%[a_ptr], #0x30]\n"
+                "vmla.f32    q7, q2, d0[1]\n"
+                "subs        %[tails], %[tails], #1\n"
+                "vmla.f32    q8, q2, d1[0]\n"
+
+                "vmla.f32    q9, q2, d1[1]\n"
+
+                "vmla.f32    q10, q3, d3[0]\n"
+                "vldr        d4, [%[b_ptr], #0x40]\n"
+                "vmla.f32    q11, q3, d3[1]\n"
+                "vldr        d5, [%[b_ptr], #0x48]\n"
+                "vmla.f32    q12, q3, d0[0]\n"
+                "vldr        d3, [%[a_ptr], #0x38]\n"
+                "vmla.f32    q13, q3, d0[1]\n"
+                "vldr        d0, [%[a_ptr], #0x40]\n"
+                "vmla.f32    q14, q3, d1[0]\n"
+                "vmla.f32    q15, q3, d1[1]\n"
+                "beq        5f\n"
+
+                // Unroll 2
+                "vmla.f32    q4, q2, d2[0]\n"
+                "vldr        d6, [%[b_ptr], #0x50]\n"
+                "vmla.f32    q5, q2, d2[1]\n"
+                "vldr        d7, [%[b_ptr], #0x58]\n"
+                "vmla.f32    q6, q2, d3[0]\n"
+                "vldr        d1, [%[a_ptr], #0x48]\n"
+                "vmla.f32    q7, q2, d3[1]\n"
+                "vmla.f32    q8, q2, d0[0]\n"
+                "vmla.f32    q9, q2, d0[1]\n"
+
+                "vmla.f32    q10, q3, d2[0]\n"
+                "vldr        d4, [%[b_ptr], #0x60]\n"
+                "vmla.f32    q11, q3, d2[1]\n"
+                "vldr        d5, [%[b_ptr], #0x68]\n"
+                "vmla.f32    q12, q3, d3[0]\n"
+                "vldr        d2, [%[a_ptr], #0x50]\n"
+                "vmla.f32    q13, q3, d3[1]\n"
+                "vldr        d3, [%[a_ptr], #0x58]\n"
+                "vmla.f32    q14, q3, d0[0]\n"
+                "vmla.f32    q15, q3, d0[1]\n"
+
+                // Unroll 3
+                "vmla.f32    q4, q2, d1[0]\n"
+                "vldr        d6, [%[b_ptr], #0x70]\n"
+                "vmla.f32    q5, q2, d1[1]\n"
+                "vldr        d7, [%[b_ptr], #0x78]\n"
+                "vmla.f32    q10, q3, d1[0]\n"
+                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q11, q3, d1[1]\n"
+                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q6, q2, d2[0]\n"
+                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q12, q3, d2[0]\n"
+                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q7, q2, d2[1]\n"
+                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q13, q3, d2[1]\n"
+                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q8, q2, d3[0]\n"
+                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q14, q3, d3[0]\n"
+                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q9, q2, d3[1]\n"
+                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q15, q3, d3[1]\n"
+                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
+                "add        %[a_ptr], %[a_ptr], #0x60\n"
+                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
+                "add        %[b_ptr], %[b_ptr], #0x80\n"
+                "b        2f\n"
+
+                // tails==1 final tail
+                "3:\n"
+                "vmla.f32    q4, q2, d0[0]\n"
+                "vldr        d6, [%[b_ptr], #0x10]\n"
+                "vmla.f32    q5, q2, d0[1]\n"
+                "vldr        d7, [%[b_ptr], #0x18]\n"
+                "vmla.f32    q6, q2, d1[0]\n"
+                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q10, q3, d0[0]\n"
+                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q11, q3, d0[1]\n"
+                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q12, q3, d1[0]\n"
+                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q7, q2, d1[1]\n"
+                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q13, q3, d1[1]\n"
+                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q8, q2, d2[0]\n"
+                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q14, q3, d2[0]\n"
+                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q9, q2, d2[1]\n"
+                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q15, q3, d2[1]\n"
+                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
+                "add        %[a_ptr], %[a_ptr], #0x18\n"
+                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
+                "add        %[b_ptr], %[b_ptr], #0x20\n"
+                "b        2f\n"
+
+                // tails==2 final tail
+                "4:\n"
+                "vmla.f32    q4, q2, d3[0]\n"
+                "vldr        d6, [%[b_ptr], #0x30]\n"
+                "vmla.f32    q5, q2, d3[1]\n"
+                "vldr        d7, [%[b_ptr], #0x38]\n"
+                "vmla.f32    q10, q3, d3[0]\n"
+                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q11, q3, d3[1]\n"
+                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q6, q2, d0[0]\n"
+                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q12, q3, d0[0]\n"
+                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q7, q2, d0[1]\n"
+                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q13, q3, d0[1]\n"
+                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q8, q2, d1[0]\n"
+                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q14, q3, d1[0]\n"
+                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q9, q2, d1[1]\n"
+                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q15, q3, d1[1]\n"
+                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
+                "add        %[b_ptr], %[b_ptr], #0x40\n"
+                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
+                "add        %[a_ptr], %[a_ptr], #0x30\n"
+                "b        2f\n"
+
+                // tails==3 final tail
+                "5:\n"
+                "vmla.f32    q4, q2, d2[0]\n"
+                "vldr        d6, [%[b_ptr], #0x50]\n"
+                "vmla.f32    q5, q2, d2[1]\n"
+                "vldr        d7, [%[b_ptr], #0x58]\n"
+                "vmla.f32    q6, q2, d3[0]\n"
+                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q10, q3, d2[0]\n"
+                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q11, q3, d2[1]\n"
+                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q12, q3, d3[0]\n"
+                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q7, q2, d3[1]\n"
+                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q13, q3, d3[1]\n"
+                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q8, q2, d0[0]\n"
+                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q14, q3, d0[0]\n"
+                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q9, q2, d0[1]\n"
+                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q15, q3, d0[1]\n"
+                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
+                "add        %[a_ptr], %[a_ptr], #0x48\n"
+                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
+                "add        %[b_ptr], %[b_ptr], #0x60\n"
+
+                "2:\n"
+                "vst1.32    {d30-d31}, [%[c_ptr] :128]!\n"
+                : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), [k] "+r"(k), [tails] "+r"(tails)
+                :
+                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0", "r1");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif
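
This kernel and the A53 variant share the same loop bookkeeping: the main loop handles k groups of four k-steps and the detached tail handles the remaining one to four (never zero, so the epilogue that stores C always runs).  A tiny self-check of that arithmetic:

    #include <cassert>

    // Verifies the k/tails split used above: for every K >= 1, the main loop's
    // 4*k steps plus the 1-4 tail steps cover K exactly.
    int main()
    {
        for(int K = 1; K <= 1024; K++)
        {
            int tails = (K & 3);
            if(tails == 0)
            {
                tails = 4;
            }
            int k = ((K + 3) / 4) - 1;

            assert(tails >= 1 && tails <= 4);
            assert((4 * k) + tails == K);
        }
        return 0;
    }
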
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/generic.cpp
new file mode 100644
index 0000000..d7d0484
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/generic.cpp
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __arm__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 8x6), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
+
+namespace arm_gemm
+{
+void a32_sgemm_8x6(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K)
+{
+    const float *a_ptr = Apanel;
+    float       *c_ptr = Cpanel;
+
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const float *a_ptr0 = a_ptr;
+        const float *b_ptr  = Bpanel;
+
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            a_ptr     = a_ptr0;
+            int tails = (K & 3);
+            if(tails == 0)
+            {
+                tails = 4;
+            }
+            int k = ((K + 3) / 4) - 1;
+
+            __asm __volatile(
+                "vmov.i32    q4, #0\n"
+                "vld1.32    {d0-d1}, [%[a_ptr] :64]!\n"
+                "vmov.i32    q5, #0\n"
+                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+                "vmov.i32    q6, #0\n" ASM_PREFETCH("[%[a_ptr], #48]") "vmov.i32    q7, #0\n" ASM_PREFETCH("[%[b_ptr], #48]") "vmov.i32    q8, #0\n" ASM_PREFETCH("[%[a_ptr], #112]") "vmov.i32    q9, #0\n"
+                ASM_PREFETCH("[%[b_ptr], #112]")
+                "vmov.i32    q10, #0\n"
+                "vmov.i32    q11, #0\n"
+                "vmov.i32    q12, #0\n"
+                "vmov.i32    q13, #0\n" ASM_PREFETCH("[%[a_ptr], #176]") "vmov.i32    q14, #0\n" ASM_PREFETCH("[%[b_ptr], #176]")
+                "vmov.i32    q15, #0\n"
+
+                "cmp        %[k], #0\n"
+                "beq        6f\n"
+
+                "1:\n"
+                // Unroll 0
+                "vmla.f32    q4, q2, d0[0]\n"
+                "vld1.32    {d2-d3}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q5, q2, d0[1]\n"
+                "vmla.f32    q6, q2, d1[0]\n"
+                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+                "vmla.f32    q7, q2, d1[1]\n"
+                "vmla.f32    q8, q2, d2[0]\n"
+                "vmla.f32    q9, q2, d2[1]\n"
+                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+
+                "vmla.f32    q10, q3, d0[0]\n"
+                "vmla.f32    q11, q3, d0[1]\n"
+                "vmla.f32    q12, q3, d1[0]\n"
+                "vmla.f32    q13, q3, d1[1]\n"
+                "vld1.32    {d0-d1}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q14, q3, d2[0]\n"
+                "vmla.f32    q15, q3, d2[1]\n"
+                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+
+                // Unroll 1
+                "vmla.f32    q4, q2, d3[0]\n"
+                "subs        %[k], %[k], #1\n"
+                "vmla.f32    q5, q2, d3[1]\n" ASM_PREFETCH("[%[a_ptr], #208]")
+                "vmla.f32    q6, q2, d0[0]\n"
+                "vmla.f32    q7, q2, d0[1]\n" ASM_PREFETCH("[%[b_ptr], #192]")
+                "vmla.f32    q8, q2, d1[0]\n"
+                "vmla.f32    q9, q2, d1[1]\n"
+                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+
+                "vmla.f32    q10, q3, d3[0]\n"
+                "vmla.f32    q11, q3, d3[1]\n"
+                "vld1.32    {d2-d3}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q12, q3, d0[0]\n"
+                "vmla.f32    q13, q3, d0[1]\n"
+                "vmla.f32    q14, q3, d1[0]\n"
+                "vmla.f32    q15, q3, d1[1]\n"
+                "vld1.32    {d0-d1}, [%[a_ptr] :64]!\n"
+
+                // Unroll 2
+                "vmla.f32    q4, q2, d2[0]\n"
+                "vmla.f32    q5, q2, d2[1]\n"
+                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+                "vmla.f32    q6, q2, d3[0]\n"
+                "vmla.f32    q7, q2, d3[1]\n" ASM_PREFETCH("[%[a_ptr], #240]")
+                "vmla.f32    q8, q2, d0[0]\n"
+                "vmla.f32    q9, q2, d0[1]\n"
+                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+
+                "vmla.f32    q10, q3, d2[0]\n"
+                "vmla.f32    q11, q3, d2[1]\n" ASM_PREFETCH("[%[b_ptr], #208]")
+                "vmla.f32    q12, q3, d3[0]\n"
+                "vmla.f32    q13, q3, d3[1]\n"
+                "vld1.32    {d2-d3}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q14, q3, d0[0]\n"
+                "vmla.f32    q15, q3, d0[1]\n"
+                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+
+                // Unroll 3
+                "vmla.f32    q4, q2, d1[0]\n"
+                "vmla.f32    q5, q2, d1[1]\n"
+                "vmla.f32    q6, q2, d2[0]\n"
+                "vmla.f32    q7, q2, d2[1]\n"
+                "vmla.f32    q8, q2, d3[0]\n"
+                "vmla.f32    q9, q2, d3[1]\n"
+                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+
+                "vmla.f32    q10, q3, d1[0]\n"
+                "vmla.f32    q11, q3, d1[1]\n"
+                "vld1.32    {d0-d1}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q12, q3, d2[0]\n"
+                "vmla.f32    q13, q3, d2[1]\n"
+                "vmla.f32    q14, q3, d3[0]\n"
+                "vmla.f32    q15, q3, d3[1]\n"
+                "bne        1b\n"
+
+                // Branch here if we never execute main loop.
+                "6:\n"
+
+                // "Tails" shows how many multiply blocks are needed at the
+                // end, must be 1-4 inclusive.  Bail out to alternative tail
+                // immediately if it's 1.
+                "subs        %[tails], %[tails], #1\n"
+                "beq        3f\n"
+
+                // Detached final iteration
+                // Unroll 0
+                "vmla.f32    q4, q2, d0[0]\n"
+                "vld1.32    {d2-d3}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q5, q2, d0[1]\n"
+                "vmla.f32    q6, q2, d1[0]\n"
+                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+                "vmla.f32    q7, q2, d1[1]\n"
+                "vmla.f32    q8, q2, d2[0]\n"
+                "subs        %[tails], %[tails], #1\n"
+                "vmla.f32    q9, q2, d2[1]\n"
+                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+
+                "vmla.f32    q10, q3, d0[0]\n"
+                "vmla.f32    q11, q3, d0[1]\n"
+                "vmla.f32    q12, q3, d1[0]\n"
+                "vmla.f32    q13, q3, d1[1]\n"
+                "vld1.32    {d0-d1}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q14, q3, d2[0]\n"
+                "vmla.f32    q15, q3, d2[1]\n"
+                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+                "beq        4f\n"
+
+                // Unroll 1
+                "vmla.f32    q4, q2, d3[0]\n"
+                "vmla.f32    q5, q2, d3[1]\n"
+                "subs        %[tails], %[tails], #1\n"
+                "vmla.f32    q6, q2, d0[0]\n"
+                "vmla.f32    q7, q2, d0[1]\n"
+                "vmla.f32    q8, q2, d1[0]\n"
+                "vmla.f32    q9, q2, d1[1]\n"
+                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+
+                "vmla.f32    q10, q3, d3[0]\n"
+                "vmla.f32    q11, q3, d3[1]\n"
+                "vld1.32    {d2-d3}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q12, q3, d0[0]\n"
+                "vmla.f32    q13, q3, d0[1]\n"
+                "vmla.f32    q14, q3, d1[0]\n"
+                "vmla.f32    q15, q3, d1[1]\n"
+                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+                "beq        5f\n"
+
+                // Unroll 2
+                "vld1.32    {d0-d1}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q4, q2, d2[0]\n"
+                "vmla.f32    q5, q2, d2[1]\n"
+                "vmla.f32    q6, q2, d3[0]\n"
+                "vmla.f32    q7, q2, d3[1]\n"
+                "vmla.f32    q8, q2, d0[0]\n"
+                "vmla.f32    q9, q2, d0[1]\n"
+                "vld1.32    {d4-d5}, [%[b_ptr] :128]!\n"
+
+                "vmla.f32    q10, q3, d2[0]\n"
+                "vmla.f32    q11, q3, d2[1]\n"
+                "vmla.f32    q12, q3, d3[0]\n"
+                "vmla.f32    q13, q3, d3[1]\n"
+                "vld1.32    {d2-d3}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q14, q3, d0[0]\n"
+                "vmla.f32    q15, q3, d0[1]\n"
+                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+
+                // Unroll 3
+                "vmla.f32    q4, q2, d1[0]\n"
+                "vmla.f32    q10, q3, d1[0]\n"
+                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q5, q2, d1[1]\n"
+                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q11, q3, d1[1]\n"
+                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q6, q2, d2[0]\n"
+                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q12, q3, d2[0]\n"
+                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q7, q2, d2[1]\n"
+                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q13, q3, d2[1]\n"
+                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q8, q2, d3[0]\n"
+                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q14, q3, d3[0]\n"
+                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q9, q2, d3[1]\n"
+                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q15, q3, d3[1]\n"
+                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
+                "b        2f\n"
+
+                // tails==1 final tail
+                "3:\n"
+                "vmla.f32    q4, q2, d0[0]\n"
+                "vld1.32    {d2}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q5, q2, d0[1]\n"
+                "vld1.32    {d6-d7}, [%[b_ptr] :128]!\n"
+                "vmla.f32    q6, q2, d1[0]\n"
+                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q10, q3, d0[0]\n"
+                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q11, q3, d0[1]\n"
+                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q12, q3, d1[0]\n"
+                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q7, q2, d1[1]\n"
+                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q13, q3, d1[1]\n"
+                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q8, q2, d2[0]\n"
+                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q14, q3, d2[0]\n"
+                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q9, q2, d2[1]\n"
+                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q15, q3, d2[1]\n"
+                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
+                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
+                "b        2f\n"
+
+                // tails==2 final tail
+                "4:\n"
+                "vmla.f32    q4, q2, d3[0]\n"
+                "vmla.f32    q10, q3, d3[0]\n"
+                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q5, q2, d3[1]\n"
+                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q11, q3, d3[1]\n"
+                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q6, q2, d0[0]\n"
+                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q12, q3, d0[0]\n"
+                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q7, q2, d0[1]\n"
+                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q13, q3, d0[1]\n"
+                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q8, q2, d1[0]\n"
+                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q14, q3, d1[0]\n"
+                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q9, q2, d1[1]\n"
+                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q15, q3, d1[1]\n"
+                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
+                "b        2f\n"
+
+                // tails==3 final tail
+                "5:\n"
+                "vmla.f32    q4, q2, d2[0]\n"
+                "vld1.32    {d0}, [%[a_ptr] :64]!\n"
+                "vmla.f32    q5, q2, d2[1]\n"
+                "vmla.f32    q6, q2, d3[0]\n"
+                "vst1.32    {d8-d9}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q10, q3, d2[0]\n"
+                "vst1.32    {d20-d21}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q11, q3, d2[1]\n"
+                "vst1.32    {d10-d11}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q12, q3, d3[0]\n"
+                "vst1.32    {d22-d23}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q7, q2, d3[1]\n"
+                "vst1.32    {d12-d13}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q13, q3, d3[1]\n"
+                "vst1.32    {d24-d25}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q8, q2, d0[0]\n"
+                "vst1.32    {d14-d15}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q14, q3, d0[0]\n"
+                "vst1.32    {d26-d27}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q9, q2, d0[1]\n"
+                "vst1.32    {d16-d17}, [%[c_ptr] :128]!\n"
+                "vmla.f32    q15, q3, d0[1]\n"
+                "vst1.32    {d28-d29}, [%[c_ptr] :128]!\n"
+                "vst1.32    {d18-d19}, [%[c_ptr] :128]!\n"
+
+                "2:\n"
+                "vst1.32    {d30-d31}, [%[c_ptr] :128]!\n"
+                : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), [k] "+r"(k), [tails] "+r"(tails)
+                :
+                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif
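
The tail handling in the kernel above is easier to follow as structured control flow. The sketch below (C++, illustrative only; do_multiply_block and multiply_block_and_store are placeholder names, not functions from this file) shows how a "tails" value of 1-4 selects how many of the four multiply blocks still run in the detached final iteration, with the final block always interleaving its stores:

    // Placeholder stand-ins for one unrolled vmla block and for the final
    // block that interleaves vmla with the q4..q15 stores.
    static void do_multiply_block() {}
    static void multiply_block_and_store() {}

    // Mirrors the branches around labels 6:/3:/4:/5: above; tails is 1-4 inclusive.
    static void final_iteration_sketch(int tails)
    {
        if(tails == 1) { multiply_block_and_store(); return; } // label "3:"
        do_multiply_block();                                   // Unroll 0
        if(tails == 2) { multiply_block_and_store(); return; } // label "4:"
        do_multiply_block();                                   // Unroll 1
        if(tails == 3) { multiply_block_and_store(); return; } // label "5:"
        do_multiply_block();                                   // Unroll 2
        multiply_block_and_store();                            // Unroll 3
    }
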
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
new file mode 100644
index 0000000..387f899
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_gemm_s16_asimd_12x8(const int16_t *, const int16_t *, int32_t *, int, int, int);
+
+// 12x8 GEMM (int16) "strategy" class.
+//
+// This describes the characteristics of a family of kernels, in terms of
+// the required interleave properties and the output block size.
+//
+// All kernels in the family must share these characteristics.  The actual
+// kernel to be used can be chosen at runtime, based on the CPUInfo
+// structure.
+class gemm_s16_12x8
+{
+public:
+    typedef int16_t operand_type;
+    typedef int32_t result_type;
+
+    typedef void (*kern_type)(const int16_t *, const int16_t *, int32_t *, int, int, int);
+
+    /* Describes the data layout for A input */
+    static const int A_interleave = 8;
+    static const int A_block      = 1;
+    static const int A_transpose  = 0;
+
+    /* Same for B input */
+    static const int B_interleave = 12;
+    static const int B_block      = 1;
+    static const int B_transpose  = 1;
+
+    /* Kernel blocking parameters */
+    static const int out_width  = 12;
+    static const int out_height = 8;
+    static const int k_unroll   = 1;
+
+    kern_type kernel = a64_gemm_s16_asimd_12x8;
+
+    gemm_s16_12x8(const CPUInfo *ci)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
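
As a rough illustration of how a strategy class like this might be consumed (a sketch, not the library's actual driver code: run_one_block is an invented name, while kernel, operand_type, result_type and the CPUInfo constructor argument are taken from the class above), a templated caller could instantiate the strategy to pick the kernel at runtime and then invoke it on pre-interleaved panels:

    template <typename strategy>
    void run_one_block(const CPUInfo *ci,
                       const typename strategy::operand_type *a_panel,
                       const typename strategy::operand_type *b_panel,
                       typename strategy::result_type *c_panel,
                       int ablocks, int bblocks, int K)
    {
        // The constructor may swap in a CPU-specific kernel variant, as the
        // s8 strategy later in this patch does for the A55r1.
        strategy strat(ci);
        strat.kernel(a_panel, b_panel, c_panel, ablocks, bblocks, K);
    }
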
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp
new file mode 100644
index 0000000..b217dcf
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm
+{
+void a64_gemm_s16_asimd_12x8(const int16_t *Apanel, const int16_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K)
+{
+    const int16_t *a_ptr = Apanel;
+    int32_t       *c_ptr = Cpanel;
+
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const int16_t *a_ptr0 = a_ptr;
+        const int16_t *b_ptr  = Bpanel;
+
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            a_ptr            = a_ptr0;
+            const bool odd_k = K & 0x1;
+            int        k     = (K + 1) / 2 - 1;
+
+            register int16x8_t aa asm("v0");
+            register int16x8_t ab asm("v1");
+            register int16x8_t b0 asm("v2");
+            register int16x8_t b1 asm("v3");
+            register int16x8_t b2 asm("v4");
+
+            __asm __volatile(
+                "ldr %d[aa], [%x[a_ptr]]\n" // Load A[A].lower
+                "movi v5.4s, #0\n"
+                "ldr x20, [%x[a_ptr], #0x08]\n" // Load A[A].upper
+                "movi v6.4s, #0\n"
+                "ldr %d[b0], [%x[b_ptr]]\n" // Load B[0].lower
+                "ins %[aa].d[1], x20\n"     // Merge A[A].lower and upper
+                "movi v7.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi v8.4s, #0\n"
+                "ldr x20, [%x[b_ptr], #0x08]\n" // Load B[0].upper
+                "movi v9.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi v10.4s, #0\n"
+                "ldr %d[b1], [%x[b_ptr], #0x10]\n" // Load B[1].lower
+                "ins %[b0].d[1], x20\n"            // Merge B[0].lower and upper
+                "movi v11.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #96]")
+                "movi v12.4s, #0\n"
+                "movi v13.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #96]")
+                "movi v14.4s, #0\n"
+                "movi v15.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #128]")
+                "movi v16.4s, #0\n"
+                "movi v17.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi v18.4s, #0\n"
+                "movi v19.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #160]")
+                "movi v20.4s, #0\n"
+                "movi v21.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #160]")
+                "movi v22.4s, #0\n"
+                "movi v23.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi v24.4s, #0\n"
+                "add %x[a_ptr], %x[a_ptr], #0x10\n"
+                "movi v25.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi v26.4s, #0\n"
+                "add %x[b_ptr], %x[b_ptr], #0x18\n"
+                "movi v27.4s, #0\n"
+                "movi v28.4s, #0\n"
+
+                "cbz %x[k], 2f\n" // Skip the loop if doing zero iterations.
+
+                "1:\n" // Main loop
+                // First unroll
+                "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+                "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
+                "smlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+                "smlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+                "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
+                "ins %[b1].d[1], x20\n"     // Merge B[1].lower and .upper
+                "smlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+                "smlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+                "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
+                "smlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+                "smlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+                "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
+                "ins %[ab].d[1], x20\n"           // Merge A[B].lower and .upper
+                "smlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+                "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+                "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
+                "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+                "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+                "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+                "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+                "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+                "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+                "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+                "ldr %d[b0], [%x[b_ptr], #0x18]\n" // Load B[0].lower
+                "ins %[b2].d[1], x20\n"            // Merge B[2].lower and .upper
+                "smlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+                "smlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+                "ldr x20, [%x[b_ptr], #0x20]\n" // Load B[0].upper
+                "smlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+                "smlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+                "smlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+                "smlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+                "smlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+                "smlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+
+                // Second unroll
+                "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
+                "ldr %d[aa], [%x[a_ptr], #0x10]\n" // Load A[A].lower
+                "ins %[b0].d[1], x20\n"            // Merge B[0].lower and .upper
+                "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
+                "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
+                "ldr x20, [%x[a_ptr], #0x18]\n" // Load A[A].upper
+                "smlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
+                "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
+                "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
+                "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
+                "add %x[a_ptr], %x[a_ptr], #0x20\n"
+                "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
+                "smlal v13.4s, %[b2].4h, %[ab].h[0]\n" ASM_PREFETCH("[%[b_ptr], #320]")
+                "smlal v14.4s, %[b2].4h, %[ab].h[1]\n"
+                "smlal v15.4s, %[b2].4h, %[ab].h[2]\n" ASM_PREFETCH("[%[a_ptr], #320]")
+                "smlal v16.4s, %[b2].4h, %[ab].h[3]\n"
+                "smlal v17.4s, %[b2].4h, %[ab].h[4]\n" ASM_PREFETCH("[%[b_ptr], #448]")
+                "smlal v18.4s, %[b2].4h, %[ab].h[5]\n"
+                "smlal v19.4s, %[b2].4h, %[ab].h[6]\n"
+                "smlal v20.4s, %[b2].4h, %[ab].h[7]\n"
+                "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
+                "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
+                "subs %x[k], %x[k], #0x1\n"
+                "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
+                "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
+                "ldr %d[b1], [%x[b_ptr], #0x28]\n" // Load B[1].lower
+                "ins %[aa].d[1], x20\n"            // Merge A[A].lower and .upper
+                "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
+                "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
+                "add %x[b_ptr], %x[b_ptr], #0x30\n"
+                "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
+                "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
+                "bne 1b\n"
+
+                "2:\n" // Even tail
+                "cbnz %x[odd_k], 3f\n"
+
+                "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+                "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
+                "smlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+                "smlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+                "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
+                "ins %[b1].d[1], x20\n"     // Merge B[1].lower and .upper
+                "smlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+                "smlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+                "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
+                "smlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+                "smlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+                "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
+                "ins %[ab].d[1], x20\n"           // Merge A[B].lower and .upper
+                "smlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+                "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+                "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
+                "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+                "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+                "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+                "add %[a_ptr], %[a_ptr], #0x10\n"
+                "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+                "add %[b_ptr], %[b_ptr], #0x18\n"
+                "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+                "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+                "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+                "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper
+                "smlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+                "smlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+                "smlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+                "smlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+                "smlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+                "smlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+                "smlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+                "smlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+
+                "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
+                "smlal v13.4s, %[b2].4h, %[ab].h[0]\n"
+                "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
+                "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
+                "smlal v14.4s, %[b2].4h, %[ab].h[1]\n"
+                "str q5, [%x[c_ptr]]\n"
+                "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
+                "str q13, [%x[c_ptr], #0x10]\n"
+                "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
+                "str q21, [%x[c_ptr], #0x20]\n"
+                "smlal v15.4s, %[b2].4h, %[ab].h[2]\n"
+                "str q6, [%x[c_ptr], #0x30]\n"
+                "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
+                "str q14, [%x[c_ptr], #0x40]\n"
+                "smlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
+                "str q22, [%x[c_ptr], #0x50]\n"
+                "smlal v16.4s, %[b2].4h, %[ab].h[3]\n"
+                "str q7, [%x[c_ptr], #0x60]\n"
+                "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
+                "str q15, [%x[c_ptr], #0x70]\n"
+                "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
+                "str q23, [%x[c_ptr], #0x80]\n"
+                "smlal v17.4s, %[b2].4h, %[ab].h[4]\n"
+                "str q8, [%x[c_ptr], #0x90]\n"
+                "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
+                "str q16, [%x[c_ptr], #0xa0]\n"
+                "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
+                "str q24, [%x[c_ptr], #0xb0]\n"
+                "smlal v18.4s, %[b2].4h, %[ab].h[5]\n"
+                "str q9, [%x[c_ptr], #0xc0]\n"
+                "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
+                "str q17, [%x[c_ptr], #0xd0]\n"
+                "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
+                "str q25, [%x[c_ptr], #0xe0]\n"
+                "smlal v19.4s, %[b2].4h, %[ab].h[6]\n"
+                "str q10, [%x[c_ptr], #0xf0]\n"
+                "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
+                "str q18, [%x[c_ptr], #0x100]\n"
+                "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
+                "str q26, [%x[c_ptr], #0x110]\n"
+                "smlal v20.4s, %[b2].4h, %[ab].h[7]\n"
+                "str q11, [%x[c_ptr], #0x120]\n"
+                "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
+                "str q19, [%x[c_ptr], #0x130]\n"
+                "b 4f\n" // Complete write out
+
+                "3:\n" // Odd tail
+                "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+                "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+                "smlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+                "smlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+                "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+                "smlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+                "str q5, [%x[c_ptr]]\n"
+                "smlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+                "str q13, [%x[c_ptr], #0x10]\n"
+                "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+                "str q21, [%x[c_ptr], #0x20]\n"
+                "smlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+                "str q6, [%x[c_ptr], #0x30]\n"
+                "smlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+                "str q14, [%x[c_ptr], #0x40]\n"
+                "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+                "str q22, [%x[c_ptr], #0x50]\n"
+                "smlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+                "str q7, [%x[c_ptr], #0x60]\n"
+                "smlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+                "str q15, [%x[c_ptr], #0x70]\n"
+                "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+                "str q23, [%x[c_ptr], #0x80]\n"
+                "smlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+                "str q8, [%x[c_ptr], #0x90]\n"
+                "smlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+                "str q16, [%x[c_ptr], #0xa0]\n"
+                "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+                "str q24, [%x[c_ptr], #0xb0]\n"
+                "smlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+                "str q9, [%x[c_ptr], #0xc0]\n"
+                "smlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+                "str q17, [%x[c_ptr], #0xd0]\n"
+                "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+                "str q25, [%x[c_ptr], #0xe0]\n"
+                "smlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+                "str q10, [%x[c_ptr], #0xf0]\n"
+                "smlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+                "str q18, [%x[c_ptr], #0x100]\n"
+                "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+                "str q26, [%x[c_ptr], #0x110]\n"
+                "smlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+                "str q11, [%x[c_ptr], #0x120]\n"
+
+                "4:\n" // End of function
+                "str q19, [%x[c_ptr], #0x130]\n"
+                "str q27, [%x[c_ptr], #0x140]\n"
+                "str q12, [%x[c_ptr], #0x150]\n"
+                "str q20, [%x[c_ptr], #0x160]\n"
+                "str q28, [%x[c_ptr], #0x170]\n"
+                "add %x[c_ptr], %x[c_ptr], #0x180\n"
+                : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), [k] "+r"(k),
+                [aa] "+w"(aa), [ab] "+w"(ab), [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2)
+                : [odd_k] "r"(odd_k)
+                : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif
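
Functionally, each inner-loop invocation above accumulates one 8x12 block of int32 results from int16 operands. A minimal scalar sketch of the same computation follows, assuming the packed panel layout implied by A_interleave = 8 and B_interleave = 12 (one group of 8 A values and 12 B values per k step); the function and parameter names are illustrative:

    #include <stdint.h>

    // Scalar reference for one 8x12 output block; the assembly above computes
    // the same sums with unrolled smlal/smlal2 sequences and writes the block
    // out row by row (three q registers, i.e. 12 int32 values, per row).
    static void gemm_s16_12x8_ref(const int16_t *a_panel, const int16_t *b_panel,
                                  int32_t *c_panel, int K)
    {
        for(int row = 0; row < 8; row++)
        {
            for(int col = 0; col < 12; col++)
            {
                int32_t acc = 0;
                for(int k = 0; k < K; k++)
                {
                    acc += (int32_t)a_panel[k * 8 + row] * (int32_t)b_panel[k * 12 + col];
                }
                c_panel[row * 12 + col] = acc;
            }
        }
    }
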
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
new file mode 100644
index 0000000..08f90e1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_gemm_s8_12x8(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void a64_gemm_s8_12x8_a55r1(const int8_t *, const int8_t *, int32_t *, int, int, int);
+
+class gemm_s8_12x8
+{
+public:
+    typedef int8_t  operand_type;
+    typedef int32_t result_type;
+
+    typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
+
+    /* Describes the data layout for A input */
+    static const int  A_interleave = 8;
+    static const int  A_block      = 4;
+    static const bool A_transpose  = false;
+
+    /* Same for B input */
+    static const int  B_interleave = 12;
+    static const int  B_block      = 4;
+    static const bool B_transpose  = true;
+
+    /* Kernel blocking parameters */
+    static const int out_width  = 12;
+    static const int out_height = 8;
+    static const int k_unroll   = 4;
+
+    kern_type kernel = a64_gemm_s8_12x8;
+
+    gemm_s8_12x8(const CPUInfo *ci)
+    {
+        if(ci->get_cpu_model() == CPUModel::A55r1)
+        {
+            kernel = a64_gemm_s8_12x8_a55r1;
+        }
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp
new file mode 100644
index 0000000..ef2f291
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+#include "dot_toolchain_support.h"
+#endif
+
+namespace arm_gemm
+{
+void a64_gemm_s8_12x8_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, const int ablocks, const int bblocks, const int K)
+{
+    const int8_t *a_ptr = Apanel;
+    int32_t      *c_ptr = Cpanel;
+
+    // We divide K by 4 because the sdot instruction processes 4 elements at a time.
+    const int W = K / 4;
+
+    // Fix up for odd lengths - set a flag if K is odd, but make
+    // sure we round up the iteration count.
+    const int oddk    = (W & 1);
+    const int k_iters = ((W + 1) / 2) - 1;
+
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const int8_t *a_ptr0 = a_ptr;
+        const int8_t *b_ptr  = Bpanel;
+
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            a_ptr = a_ptr0;
+            int k = k_iters;
+
+            register int32x4_t a0 asm("v0");
+            register int32x4_t a1 asm("v1");
+            register int32x4_t b0 asm("v2");
+            register int32x4_t b1 asm("v3");
+            register int32x4_t b2 asm("v4");
+            register int32x4_t a0a asm("v5");
+            register int32x4_t a1a asm("v6");
+
+            __asm __volatile(
+#ifdef NO_DOT_IN_TOOLCHAIN
+                _DECLARE_SDOT
+#else
+                ".arch armv8.2-a+dotprod\n"
+#endif
+                // Initialize result registers, load initial operands, prime prefetches.
+                "movi   v8.4s, #0x0\n"
+                "ldr    %q[a0], [%[a_ptr]]\n"
+                "movi   v9.4s, #0x0\n"
+                "ldr    %q[b0], [%[b_ptr]]\n"
+                "movi   v10.4s, #0x0\n"
+                "ldr    %q[a1], [%[a_ptr], #16]\n"
+                "movi   v11.4s, #0x0\n"
+                "ldr    %q[b1], [%[b_ptr], #16]\n"
+                "movi   v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi   v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi   v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi   v15.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #128]") "movi   v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi   v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi   v18.4s, #0x0\n"
+                "movi   v19.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi   v20.4s, #0x0\n"
+                "movi   v21.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi   v22.4s, #0x0\n"
+                "movi   v23.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]")
+                "movi   v24.4s, #0x0\n"
+                "movi   v25.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #384]")
+                "movi   v26.4s, #0x0\n"
+                "movi   v27.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #448]")
+                "movi   v28.4s, #0x0\n"
+                "movi   v29.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #384]")
+                "movi   v30.4s, #0x0\n"
+                "movi   v31.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #512]")
+
+                // The loop is offset by these two instructions which must
+                // always be executed.
+                "sdot   v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                "ldr    %d[b2], [%[b_ptr], #32]\n"
+
+                // Skip loop if we are doing zero iterations of it.
+                "cbz    %w[k], 4f\n"
+
+                "1:\n"
+                "sdot      v9.4s , %[b0].16b, %[a0].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #40]\n"
+                "sdot    v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "subs    %w[k], %w[k], #1\n"
+                "sdot    v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "ldr    %d[a0a], [%[a_ptr], #32]\n"
+
+                "sdot     v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "sdot    v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                "ldr    x20, [%[a_ptr], #40]\n"
+                "sdot    v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "sdot    v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "ldr    %d[a1a], [%[a_ptr], #48]\n"
+
+                "sdot    v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "ins    %[a0a].d[1], x20\n"
+                "sdot    v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                "ldr    x20, [%[a_ptr], #56]\n"
+                "sdot    v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "sdot    v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "ldr    %d[b0], [%[b_ptr], #48]\n"
+
+                "sdot    v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "ins    %[a1a].d[1], x20\n"
+                "sdot    v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #56]\n"
+                "sdot    v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "sdot    v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "ldr    %d[b1], [%[b_ptr], #64]\n"
+
+                "sdot    v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "ins    %[b0].d[1], x20\n"
+                "sdot    v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #72]\n"
+                "sdot    v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "sdot    v27.4s, %[b2].16b, %[a0].4b[3]\n" ASM_PREFETCH("[%[a_ptr], #448]")
+
+                "sdot    v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "sdot    v29.4s, %[b2].16b, %[a1].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #576]")
+                "sdot    v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                "sdot    v31.4s, %[b2].16b, %[a1].4b[3]\n"
+
+                // Unroll 1
+                "ldr    %d[b2], [%[b_ptr], #80]\n"
+
+                "sdot    v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+                "ins    %[b1].d[1], x20\n"
+                "sdot    v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #88]\n"
+                "sdot    v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+                "sdot    v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+                "ldr    %d[a0], [%[a_ptr], #64]\n"
+
+                "sdot     v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "sdot   v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+                "ldr    x20, [%[a_ptr], #72]\n"
+                "sdot    v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+                "sdot    v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+                "ldr    %d[a1], [%[a_ptr], #80]\n"
+
+                "sdot    v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+                "ins    %[a0].d[1], x20\n"
+                "sdot    v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+                "ldr    x20, [%[a_ptr], #88]\n"
+                "sdot    v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+                "sdot    v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+                "ldr    %d[b0], [%[b_ptr], #96]\n"
+
+                "sdot    v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+                "ins    %[a1].d[1], x20\n"
+                "sdot    v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #104]\n"
+                "sdot    v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+                "sdot    v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+                "ldr    %d[b1], [%[b_ptr], #112]\n"
+
+                "sdot    v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+                "ins    %[b0].d[1], x20\n"
+                "sdot    v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #120]\n"
+                "sdot    v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+                "sdot    v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+
+                "sdot    v28.4s, %[b2].16b, %[a1a].4b[0]\n" ASM_PREFETCH("[%[b_ptr], #640]")
+                "sdot    v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "sdot    v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+                "ins    %[b1].d[1], x20\n"
+                "sdot    v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+                "ldr    %d[b2], [%[b_ptr], #32]\n"
+
+                "sdot   v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                "b.ne    1b\n"
+
+                // Branch here if K=1 or 2.  Do the right thing for odd/even at the end.
+                "4:\n"
+
+                // Start final iteration - branch off to "odd" code before we load a0a.
+                "sdot   v9.4s , %[b0].16b, %[a0].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #40]\n"
+                "sdot   v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "cbnz   %w[oddk], 2f\n"
+
+                // Even K continuation
+                "sdot    v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "ldr    %d[a0a], [%[a_ptr], #32]\n"
+
+                "sdot     v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "sdot   v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                "ldr    x20, [%[a_ptr], #40]\n"
+                "sdot    v14.4s, %[b0].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr]]")
+                "sdot    v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "ldr    %d[a1a], [%[a_ptr], #48]\n"
+
+                "sdot    v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "ins    %[a0a].d[1], x20\n"
+                "sdot    v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                "ldr    x20, [%[a_ptr], #56]\n"
+                "sdot    v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "sdot    v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "ldr    %d[b0], [%[b_ptr], #48]\n"
+
+                "sdot    v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "ins    %[a1a].d[1], x20\n"
+                "sdot    v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #56]\n"
+                "sdot    v22.4s, %[b1].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+                "sdot    v23.4s, %[b1].16b, %[a1].4b[3]\n"
+
+                "sdot    v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "sdot    v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+                "sdot    v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "sdot    v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "ldr    %d[b1], [%[b_ptr], #64]\n"
+
+                "sdot    v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "ins    %[b0].d[1], x20\n"
+                "sdot    v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #72]\n"
+                "sdot    v30.4s, %[b2].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+                "sdot    v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "ldr    %d[b2], [%[b_ptr], #80]\n"
+
+                "sdot    v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+                "ins    %[b1].d[1], x20\n"
+                "sdot   v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #88]\n"
+                "sdot    v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+                "ins    %[b2].d[1], x20\n"
+
+                "sdot   v11.4s, %[b0].16b, %[a0a].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+                "sdot   v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+                "sdot   v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+                "sdot   v14.4s, %[b0].16b, %[a1a].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+                "sdot   v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+                "sdot   v16.4s, %[b1].16b, %[a0a].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+                "sdot   v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+                "sdot   v18.4s, %[b1].16b, %[a0a].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+                "sdot   v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+                "sdot   v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+                "sdot   v21.4s, %[b1].16b, %[a1a].4b[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]")
+                "sdot   v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+                "sdot   v23.4s, %[b1].16b, %[a1a].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]")
+                "sdot   v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+                "sdot   v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+                "sdot   v26.4s, %[b2].16b, %[a0a].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #640]")
+                "sdot   v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+                "sdot   v28.4s, %[b2].16b, %[a1a].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+                "sdot   v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+                "sdot   v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "sdot   v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+                "b      3f\n"
+
+                // Odd K continuation
+                "2:\n"
+                "sdot   v11.4s, %[b0].16b, %[a0].4b[3]\n" ASM_PREFETCHW("[%[c_ptr]]")
+                "sdot   v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "sdot   v13.4s, %[b0].16b, %[a1].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+                "sdot   v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "add    %[a_ptr], %[a_ptr], #32\n"
+                "sdot   v15.4s, %[b0].16b, %[a1].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+                "sdot   v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "add    %[b_ptr], %[b_ptr], #48\n"
+                "sdot   v17.4s, %[b1].16b, %[a0].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+                "sdot   v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "sdot   v19.4s, %[b1].16b, %[a0].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+                "sdot   v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "sdot   v21.4s, %[b1].16b, %[a1].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+                "sdot   v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "sdot   v23.4s, %[b1].16b, %[a1].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+                "sdot   v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "sdot   v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+                "sdot   v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "sdot   v27.4s, %[b2].16b, %[a0].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]") "sdot   v28.4s, %[b2].16b, %[a1].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]") "sdot   v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                ASM_PREFETCHWL2("[%[c_ptr], #640]") "sdot   v30.4s, %[b2].16b, %[a1].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+                "sdot   v31.4s, %[b2].16b, %[a1].4b[3]\n"
+
+                // Common tail
+                "3:\n"
+                "str    q8,   [%[c_ptr]]\n"
+                "str    q16,  [%[c_ptr], #16]\n"
+                "str    q24,  [%[c_ptr], #32]\n"
+                "str    q9,   [%[c_ptr], #48]\n"
+                "str    q17,  [%[c_ptr], #64]\n"
+                "str    q25,  [%[c_ptr], #80]\n"
+                "str    q10,  [%[c_ptr], #96]\n"
+                "str    q18,  [%[c_ptr], #112]\n"
+                "str    q26,  [%[c_ptr], #128]\n"
+                "str    q11,  [%[c_ptr], #144]\n"
+                "str    q19,  [%[c_ptr], #160]\n"
+                "str    q27,  [%[c_ptr], #176]\n"
+                "str    q12,  [%[c_ptr], #192]\n"
+                "str    q20,  [%[c_ptr], #208]\n"
+                "str    q28,  [%[c_ptr], #224]\n"
+                "str    q13,  [%[c_ptr], #240]\n"
+                "str    q21,  [%[c_ptr], #256]\n"
+                "str    q29,  [%[c_ptr], #272]\n"
+                "str    q14,  [%[c_ptr], #288]\n"
+                "str    q22,  [%[c_ptr], #304]\n"
+                "str    q30,  [%[c_ptr], #320]\n"
+                "str    q15,  [%[c_ptr], #336]\n"
+                "str    q23,  [%[c_ptr], #352]\n"
+                "str    q31,  [%[c_ptr], #368]\n"
+                "add    %[c_ptr], %[c_ptr], #384\n"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+                ".purgem sdot\n"
+#endif
+                :
+                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+                [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+                : [oddk] "r"(oddk)
+                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+                "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif
\ No newline at end of file
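
For reference, each by-element sdot used in the kernel above accumulates four 4-way signed dot products per instruction, which is why the setup code divides K by 4. A scalar sketch of the operation (illustrative names; not a drop-in replacement for the instruction or intrinsic):

    #include <stdint.h>

    // Scalar model of "sdot vd.4s, vn.16b, vm.4b[idx]": each 32-bit lane of
    // the accumulator gains the dot product of one 4-byte group of vn with
    // the idx-th 4-byte group of vm.
    static void sdot_by_element_ref(int32_t acc[4], const int8_t vn[16],
                                    const int8_t vm[16], int idx)
    {
        for(int lane = 0; lane < 4; lane++)
        {
            int32_t sum = 0;
            for(int j = 0; j < 4; j++)
            {
                sum += (int32_t)vn[4 * lane + j] * (int32_t)vm[4 * idx + j];
            }
            acc[lane] += sum;
        }
    }
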
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h
new file mode 100644
index 0000000..c76f99d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// Define a macro to assemble the SDOT instruction (in the absence of toolchain support)
+#define _DECLARE_SDOT                                                                                  \
+    ".altmacro\n"                                                                                      \
+    ".macro sdot opd:req, opn:req, opm:req\n"                                                          \
+    "local vd, vn, vm, h, l\n"                                                                         \
+    ".irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n" \
+    ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n"                                                               \
+    ".set vd,\\reg\n"                                                                                  \
+    ".endif\n"                                                                                         \
+    ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n"                                                              \
+    ".set vn,\\reg\n"                                                                                  \
+    ".endif\n"                                                                                         \
+    ".irp idx,0,1,2,3\n"                                                                               \
+    ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n"                                                      \
+    ".set vm,\\reg\n"                                                                                  \
+    ".set h,\\idx / 2\n"                                                                               \
+    ".set l,\\idx %% 2\n"                                                                              \
+    ".endif\n"                                                                                         \
+    ".endr\n"                                                                                          \
+    ".endr\n"                                                                                          \
+    ".ifndef vd\n"                                                                                     \
+    ".error \"Bad operand \\opd\"\n"                                                                   \
+    ".exitm\n"                                                                                         \
+    ".endif\n"                                                                                         \
+    ".ifndef vn\n"                                                                                     \
+    ".error \"Bad operand \\opn\"\n"                                                                   \
+    ".exitm\n"                                                                                         \
+    ".endif\n"                                                                                         \
+    ".ifndef vm\n"                                                                                     \
+    ".error \"Bad operand \\opm\"\n"                                                                   \
+    ".exitm\n"                                                                                         \
+    ".endif\n"                                                                                         \
+    ".ifndef h\n"                                                                                      \
+    ".error \"Bad operand \\opm\"\n"                                                                   \
+    ".exitm\n"                                                                                         \
+    ".endif\n"                                                                                         \
+    ".ifndef l\n"                                                                                      \
+    ".error \"Bad operand \\opm\"\n"                                                                   \
+    ".exitm\n"                                                                                         \
+    ".endif\n"                                                                                         \
+    ".int     0x4f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n"                      \
+    ".endm\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp
new file mode 100644
index 0000000..258ef5e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+#include "dot_toolchain_support.h"
+#endif
+
+namespace arm_gemm
+{
+void a64_gemm_s8_12x8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K)
+{
+    const int8_t *a_ptr = Apanel;
+    int32_t      *c_ptr = Cpanel;
+    // We divide K by 4 because the sdot instruction processes 4 elements at a time.
+    const int W = K / 4;
+    // Fix up for odd lengths - set a flag if K is odd, but make
+    // sure we round up the iteration count.
+    const int oddk         = (W & 1);
+    const int init_value_k = ((W + 1) / 2) - 1;
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const int8_t *a_ptr0 = a_ptr;
+        const int8_t *b_ptr  = Bpanel;
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            a_ptr                = a_ptr0;
+            int                k = init_value_k;
+            register int32x4_t a0 asm("v0");
+            register int32x4_t a1 asm("v1");
+            register int32x4_t b0 asm("v2");
+            register int32x4_t b1 asm("v3");
+            register int32x4_t b2 asm("v4");
+            register int32x4_t a0a asm("v5");
+            register int32x4_t a1a asm("v6");
+            __asm __volatile(
+#ifdef NO_DOT_IN_TOOLCHAIN
+                _DECLARE_SDOT
+#else
+                ".arch  armv8.2-a+dotprod\n"
+#endif
+                // Initialize result registers, load initial operands, prime prefetches.
+                "movi    v8.4s, #0x0\n"
+                "ldr    %q[a0], [%[a_ptr]]\n"
+                "movi    v9.4s, #0x0\n"
+                "ldr    %q[b0], [%[b_ptr]]\n"
+                "movi    v10.4s, #0x0\n"
+                "ldr    %q[a1], [%[a_ptr], #16]\n"
+                "movi    v11.4s, #0x0\n"
+                "ldr    %q[b1], [%[b_ptr], #16]\n"
+                "movi    v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi    v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi    v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi    v15.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #128]") "movi    v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi    v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") "movi    v18.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #192]") "movi    v19.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]") "movi    v20.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]") "movi    v21.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #384]")
+                "movi    v22.4s, #0x0\n"
+                "movi    v23.4s, #0x0\n"
+                "movi    v24.4s, #0x0\n"
+                "movi    v25.4s, #0x0\n"
+                "movi    v26.4s, #0x0\n"
+                "movi    v27.4s, #0x0\n"
+                "movi    v28.4s, #0x0\n"
+                "movi    v29.4s, #0x0\n"
+                "movi    v30.4s, #0x0\n"
+                "movi    v31.4s, #0x0\n"
+
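+                // For reference, v8-v31 hold the 8x12 int32 output tile: row r
+                // of A accumulates into v(8+r) (columns 0-3), v(16+r) (columns
+                // 4-7) and v(24+r) (columns 8-11), which is why the write-out
+                // below stores q8, q16, q24, q9, q17, q25 and so on.
+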
+                // Skip loop if we are doing zero iterations of it.
+                "cbz    %w[k], 4f\n"
+
+                // Loop proper
+                "1:\n"
+                "sdot    v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                "sdot      v9.4s , %[b0].16b, %[a0].4b[1]\n"
+
+                "ldr    %q[b2], [%[b_ptr], #32]\n"
+                "sdot    v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "sdot    v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "ldr    %q[a0a], [%[a_ptr], #32]\n"
+                "sdot     v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "sdot    v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                "ldr    %q[a1a], [%[a_ptr], #48]\n"
+                "sdot    v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "sdot    v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "ldr    %q[b0], [%[b_ptr], #48]\n"
+
+                "sdot    v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "sdot    v17.4s, %[b1].16b, %[a0].4b[1]\n" ASM_PREFETCH("[%[a_ptr], #320]")
+                "sdot    v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "sdot    v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "sdot    v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "sdot    v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "sdot    v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "sdot    v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "ldr    %q[b1], [%[b_ptr], #64]\n"
+
+                "sdot    v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "sdot    v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #448]")
+                "sdot    v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "sdot    v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "sdot    v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "sdot    v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                "sdot    v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                "sdot    v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "ldr    %q[b2], [%[b_ptr], #80]\n"
+
+                "sdot    v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+                "sdot    v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+                "ldr    %q[a0], [%[a_ptr], #64]\n"
+                "sdot    v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+                "sdot    v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+                "sdot     v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+                "ldr    %q[a1], [%[a_ptr], #80]\n"
+                "sdot   v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+                "sdot    v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+                "sdot    v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+                "ldr    %q[b0], [%[b_ptr], #96]\n"
+
+                "sdot    v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+                "sdot    v17.4s, %[b1].16b, %[a0a].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #512]")
+                "sdot    v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+                "sdot    v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+                "sdot    v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+                "sdot    v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+                "sdot    v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+                "sdot    v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+                "ldr    %q[b1], [%[b_ptr], #112]\n"
+
+                "sdot    v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+                "sdot    v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+                "sdot    v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+                "sdot    v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "sdot    v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+                "sdot    v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+                "subs    %w[k], %w[k], #1\n"
+                "sdot    v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+                "sdot    v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+                "bne    1b\n"
+
+                // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
+                "4:\n"
+
+                // Branch to alternative tail for odd K
+                "cbnz    %w[oddk], 2f\n"
+
+                // Detached final iteration (even K)
+                "sdot    v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                "sdot   v9.4s , %[b0].16b, %[a0].4b[1]\n"
+                "ldr    %q[b2], [%[b_ptr], #32]\n"
+                "sdot    v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "sdot    v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "ldr    %q[a0a], [%[a_ptr], #32]\n"
+                "sdot     v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "sdot   v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                "ldr    %q[a1a], [%[a_ptr], #48]\n"
+                "sdot    v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "sdot    v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "ldr    %q[b0], [%[b_ptr], #48]\n"
+
+                "sdot    v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "sdot    v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                "sdot    v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "sdot    v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "sdot    v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "sdot    v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "sdot    v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "sdot    v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "ldr    %q[b1], [%[b_ptr], #64]\n"
+
+                "sdot    v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "sdot    v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+                "sdot    v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "sdot    v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "sdot    v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "sdot    v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                "sdot    v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                "sdot    v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "ldr    %q[b2], [%[b_ptr], #80]\n"
+
+                "sdot    v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+
+                "sdot    v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "sdot   v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+                "str    q8, [%[c_ptr], #0]\n"
+                "sdot    v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+                "str    q16, [%[c_ptr], #16]\n"
+                "sdot    v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+                "str    q24, [%[c_ptr], #32]\n"
+
+                "sdot    v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+                "str    q9, [%[c_ptr], #48]\n"
+                "sdot    v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+                "str    q17, [%[c_ptr], #64]\n"
+                "sdot    v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+                "str    q25, [%[c_ptr], #80]\n"
+                "sdot    v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+                "str    q10, [%[c_ptr], #96]\n"
+
+                "sdot    v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+                "str    q18, [%[c_ptr], #112]\n"
+                "sdot    v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+                "str    q26, [%[c_ptr], #128]\n"
+                "sdot    v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+                "str    q11, [%[c_ptr], #144]\n"
+
+                "sdot     v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+                "str    q19, [%[c_ptr], #160]\n"
+                "sdot    v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+                "str    q27, [%[c_ptr], #176]\n"
+                "sdot    v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+                "str    q12, [%[c_ptr], #192]\n"
+
+                "sdot   v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+                "str    q20, [%[c_ptr], #208]\n"
+                "sdot    v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+                "str    q28, [%[c_ptr], #224]\n"
+                "sdot    v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+                "str    q13, [%[c_ptr], #240]\n"
+
+                "sdot    v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+                "str    q21, [%[c_ptr], #256]\n"
+                "sdot    v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+                "str    q29, [%[c_ptr], #272]\n"
+                "sdot    v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+                "str    q14, [%[c_ptr], #288]\n"
+
+                "sdot    v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+                "str    q22, [%[c_ptr], #304]\n"
+                "sdot    v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+                "str    q30, [%[c_ptr], #320]\n"
+                "sdot    v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+                "str    q15, [%[c_ptr], #336]\n"
+
+                "b    3f\n"
+
+                // Detached final iteration (odd K)
+                "2:\n"
+                "sdot    v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                "ldr    %q[b2], [%[b_ptr], #32]\n"
+                "sdot    v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "sdot   v9.4s , %[b0].16b, %[a0].4b[1]\n"
+                "str    q8, [%[c_ptr], #0]\n"
+                "sdot    v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                "str    q16, [%[c_ptr], #16]\n"
+                "sdot    v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "add    %[b_ptr], %[b_ptr], #48\n"
+                "add    %[a_ptr], %[a_ptr], #32\n"
+                "str    q24, [%[c_ptr], #32]\n"
+                "sdot    v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                "str    q9, [%[c_ptr], #48]\n"
+
+                "sdot    v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "str    q17, [%[c_ptr], #64]\n"
+                "sdot    v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "str    q25, [%[c_ptr], #80]\n"
+                "sdot    v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "str    q10, [%[c_ptr], #96]\n"
+
+                "sdot    v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "str    q18, [%[c_ptr], #112]\n"
+                "sdot    v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "str    q26, [%[c_ptr], #128]\n"
+                "sdot    v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "str    q11, [%[c_ptr], #144]\n"
+
+                "sdot     v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "str    q19, [%[c_ptr], #160]\n"
+                "sdot    v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "str    q27, [%[c_ptr], #176]\n"
+                "sdot    v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "str    q12, [%[c_ptr], #192]\n"
+
+                "sdot   v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                "str    q20, [%[c_ptr], #208]\n"
+                "sdot    v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "str    q28, [%[c_ptr], #224]\n"
+                "sdot    v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                "str    q13, [%[c_ptr], #240]\n"
+
+                "sdot    v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "str    q21, [%[c_ptr], #256]\n"
+                "sdot    v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "str    q29, [%[c_ptr], #272]\n"
+                "sdot    v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                "str    q14, [%[c_ptr], #288]\n"
+
+                "sdot    v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "str    q22, [%[c_ptr], #304]\n"
+                "sdot    v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "str    q30, [%[c_ptr], #320]\n"
+                "sdot    v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "str    q15, [%[c_ptr], #336]\n"
+
+                // Common tail
+                "3:\n"
+                "str    q23, [%[c_ptr], #352]\n"
+                "str    q31, [%[c_ptr], #368]\n"
+                "add    %[c_ptr], %[c_ptr], #384\n"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+                ".purgem sdot\n"
+#endif
+                :
+                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+                [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+                : [oddk] "r"(oddk)
+                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+                "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
new file mode 100644
index 0000000..2ec28f4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+
+namespace arm_gemm
+{
+// Load the actual kernel
+void a64_gemm_s8_4x4(const int8_t *, const int8_t *, int32_t *, int, int, int);
+
+class gemm_s8_4x4
+{
+public:
+    typedef int8_t  operand_type;
+    typedef int32_t result_type;
+
+    typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
+
+    /* Describes the data layout for A input */
+    static const int  A_interleave = 4;
+    static const int  A_block      = 16;
+    static const bool A_transpose  = false;
+
+    /* Same for B input */
+    static const int  B_interleave = 4;
+    static const int  B_block      = 16;
+    static const bool B_transpose  = true;
+
+    /* Kernel blocking parameters */
+    static const int out_width  = 4;
+    static const int out_height = 4;
+    static const int k_unroll   = 16;
+
+    kern_type kernel = a64_gemm_s8_4x4;
+
+    gemm_s8_4x4(const CPUInfo *ci)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp
new file mode 100644
index 0000000..243b94e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm
+{
+void a64_gemm_s8_4x4(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K)
+{
+    const int8_t *a_ptr = Apanel;
+    int32_t      *c_ptr = Cpanel;
+
+    K /= 16;
+    int oddk = (K & 1);
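+    // After the division K counts 16-element blocks (this kernel's k_unroll);
+    // the main loop consumes two blocks per iteration, hence the
+    // ((K + 1) / 2) - 1 trip count below and the odd-K tail.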
+
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const int8_t *a_ptr0 = a_ptr;
+        const int8_t *b_ptr  = Bpanel;
+
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            a_ptr = a_ptr0;
+
+            int k = ((K + 1) / 2) - 1;
+
+            register int8x16_t b0 asm("v4");
+            register int8x16_t b1 asm("v5");
+            register int8x16_t b2 asm("v6");
+            register int8x16_t b3 asm("v7");
+            register int8x16_t b0a asm("v8");
+            register int8x16_t b1a asm("v9");
+            register int8x16_t b2a asm("v10");
+            register int8x16_t b3a asm("v11");
+
+            __asm __volatile(
+                "movi    v16.4s, #0x0\n"
+                "ldr    q0, [%[a_ptr]]\n"
+                "movi    v17.4s, #0x0\n"
+                "ldr    %q[b0], [%[b_ptr]]\n"
+                "movi    v18.4s, #0x0\n"
+                "ldr    %q[b1], [%[b_ptr], #16]\n"
+                "movi    v19.4s, #0x0\n"
+                "ldr    %q[b2], [%[b_ptr], #32]\n"
+                "movi    v20.4s, #0x0\n"
+                "ldr    %q[b3], [%[b_ptr], #48]\n"
+                "movi    v21.4s, #0x0\n"
+                "ldr    q1, [%[a_ptr], #16]\n"
+                "movi    v22.4s, #0x0\n"
+                "ldr    q2, [%[a_ptr], #32]\n"
+                "movi    v23.4s, #0x0\n"
+                "ldr    q3, [%[a_ptr], #48]\n"
+                "movi    v24.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi    v25.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi    v26.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi    v27.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #128]") "movi    v28.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi    v29.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]") "movi    v30.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #256]") "movi    v31.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]")
+
+                // Loop structure optimized for A57 (after r0).
+
+                // Unavoidably, the multiply will "dribble" if
+                // dual issued with an add.
+
+                // Minimize the effect of this by making sure
+                // there are 2 adds to run under the dribbled
+                // multiply.
+
+                // Pipeline in blocks of 8 multiplies - combine
+                // this iteration's multiplies with adds from
+                // the previous iteration.
+
+                // So the first block doesn't have any adds to
+                // do - but because all the adds are at the
+                // start of the block it's only the first couple
+                // of multiplies that need to be pulled out.
+
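+                // Concretely, each A row uses smull/smlal2 to multiply the low
+                // and high halves of 16 int8 values into 16-bit partial sums
+                // (v12-v15), and the sadalp instructions of the next block
+                // pairwise-accumulate those into the 32-bit totals in v16-v31.
+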
+                // Start of unroll 0 (first iteration)
+                "smull    v12.8h, v0.8b, %[b0].8b\n"
+                "smull    v13.8h, v0.8b, %[b1].8b\n"
+
+                // Skip loop if we are doing zero iterations of it.
+                "cbz    %w[k], 4f\n"
+
+                // Unroll 0 continuation (branch target)
+                "1:\n"
+                "smull    v14.8h, v0.8b, %[b2].8b\n"
+                "subs    %w[k], %w[k], #1\n"
+                "smull    v15.8h, v0.8b, %[b3].8b\n"
+                "ldr    %q[b0a], [%[b_ptr], #64]\n"
+                "smlal2    v12.8h, v0.16b, %[b0].16b\n"
+                "smlal2    v13.8h, v0.16b, %[b1].16b\n"
+                "ldr    %q[b1a], [%[b_ptr], #80]\n"
+                "smlal2    v14.8h, v0.16b, %[b2].16b\n"
+                "smlal2    v15.8h, v0.16b, %[b3].16b\n"
+                "ldr     q0, [%[a_ptr], #64]\n"
+
+                "sadalp    v16.4s, v12.8h\n"
+                "smull    v12.8h, v1.8b, %[b0].8b\n"
+                "sadalp    v17.4s, v13.8h\n"
+                "sadalp    v18.4s, v14.8h\n"
+                "smull    v13.8h, v1.8b, %[b1].8b\n"
+                "sadalp    v19.4s, v15.8h\n"
+                "smull    v14.8h, v1.8b, %[b2].8b\n"
+                "ldr    %q[b2a], [%[b_ptr], #96]\n"
+                "smull    v15.8h, v1.8b, %[b3].8b\n"
+                "smlal2    v12.8h, v1.16b, %[b0].16b\n"
+                "ldr    %q[b3a], [%[b_ptr], #112]\n"
+                "smlal2    v13.8h, v1.16b, %[b1].16b\n"
+                "add    %[b_ptr], %[b_ptr], #128\n"
+                "smlal2    v14.8h, v1.16b, %[b2].16b\n"
+                "smlal2    v15.8h, v1.16b, %[b3].16b\n"
+                "ldr     q1, [%[a_ptr], #80]\n"
+
+                "sadalp    v20.4s, v12.8h\n"
+                "smull    v12.8h, v2.8b, %[b0].8b\n"
+                "sadalp    v21.4s, v13.8h\n"
+                "sadalp    v22.4s, v14.8h\n"
+                "smull    v13.8h, v2.8b, %[b1].8b\n"
+                "sadalp    v23.4s, v15.8h\n"
+                "smull    v14.8h, v2.8b, %[b2].8b\n"
+                "smull    v15.8h, v2.8b, %[b3].8b\n"
+                "smlal2    v12.8h, v2.16b, %[b0].16b\n" ASM_PREFETCH("[%[b_ptr], #192]")
+                "smlal2    v13.8h, v2.16b, %[b1].16b\n"
+                "smlal2    v14.8h, v2.16b, %[b2].16b\n" ASM_PREFETCH("[%[a_ptr], #320]")
+                "smlal2    v15.8h, v2.16b, %[b3].16b\n"
+                "ldr     q2, [%[a_ptr], #96]\n"
+
+                "sadalp    v24.4s, v12.8h\n"
+                "smull    v12.8h, v3.8b, %[b0].8b\n"
+                "sadalp    v25.4s, v13.8h\n"
+                "sadalp    v26.4s, v14.8h\n"
+                "smull    v13.8h, v3.8b, %[b1].8b\n"
+                "sadalp    v27.4s, v15.8h\n"
+                "smull    v14.8h, v3.8b, %[b2].8b\n"
+                "smull    v15.8h, v3.8b, %[b3].8b\n"
+                "smlal2    v12.8h, v3.16b, %[b0].16b\n"
+                "ldr     %q[b0], [%[b_ptr], #0]\n"
+                "smlal2    v13.8h, v3.16b, %[b1].16b\n"
+                "smlal2    v14.8h, v3.16b, %[b2].16b\n"
+                "smlal2    v15.8h, v3.16b, %[b3].16b\n"
+                "ldr     q3, [%[a_ptr], #112]\n"
+
+                // Unroll 1
+                "sadalp    v28.4s, v12.8h\n"
+                "smull    v12.8h, v0.8b, %[b0a].8b\n"
+                "sadalp    v29.4s, v13.8h\n"
+                "sadalp    v30.4s, v14.8h\n"
+                "smull    v13.8h, v0.8b, %[b1a].8b\n"
+                "sadalp    v31.4s, v15.8h\n"
+                "smull    v14.8h, v0.8b, %[b2a].8b\n"
+                "smull    v15.8h, v0.8b, %[b3a].8b\n"
+                "ldr     %q[b1], [%[b_ptr], #16]\n"
+                "smlal2    v12.8h, v0.16b, %[b0a].16b\n"
+                "smlal2    v13.8h, v0.16b, %[b1a].16b\n"
+                "ldr     %q[b2], [%[b_ptr], #32]\n"
+                "smlal2    v14.8h, v0.16b, %[b2a].16b\n"
+                "smlal2    v15.8h, v0.16b, %[b3a].16b\n"
+                "ldr     q0, [%[a_ptr], #128]\n"
+
+                "sadalp    v16.4s, v12.8h\n"
+                "smull    v12.8h, v1.8b, %[b0a].8b\n"
+                "sadalp    v17.4s, v13.8h\n"
+                "sadalp    v18.4s, v14.8h\n"
+                "smull    v13.8h, v1.8b, %[b1a].8b\n"
+                "sadalp    v19.4s, v15.8h\n"
+                "add    %[a_ptr], %[a_ptr], #128\n"
+                "smull    v14.8h, v1.8b, %[b2a].8b\n"
+                "smull    v15.8h, v1.8b, %[b3a].8b\n"
+                "ldr     %q[b3], [%[b_ptr], #48]\n"
+                "smlal2    v12.8h, v1.16b, %[b0a].16b\n"
+                "smlal2    v13.8h, v1.16b, %[b1a].16b\n"
+                "smlal2    v14.8h, v1.16b, %[b2a].16b\n"
+                "smlal2    v15.8h, v1.16b, %[b3a].16b\n"
+                "ldr     q1, [%[a_ptr], #16]\n"
+
+                "sadalp    v20.4s, v12.8h\n"
+                "smull    v12.8h, v2.8b, %[b0a].8b\n"
+                "sadalp    v21.4s, v13.8h\n"
+                "sadalp    v22.4s, v14.8h\n"
+                "smull    v13.8h, v2.8b, %[b1a].8b\n"
+                "sadalp    v23.4s, v15.8h\n"
+                "smull    v14.8h, v2.8b, %[b2a].8b\n"
+                "smull    v15.8h, v2.8b, %[b3a].8b\n"
+                "smlal2    v12.8h, v2.16b, %[b0a].16b\n" ASM_PREFETCH("[%[b_ptr], #256]")
+                "smlal2    v13.8h, v2.16b, %[b1a].16b\n"
+                "smlal2    v14.8h, v2.16b, %[b2a].16b\n" ASM_PREFETCH("[%[a_ptr], #256]")
+                "smlal2    v15.8h, v2.16b, %[b3a].16b\n"
+                "ldr     q2, [%[a_ptr], #32]\n"
+
+                "sadalp    v24.4s, v12.8h\n"
+                "smull    v12.8h, v3.8b, %[b0a].8b\n"
+                "sadalp    v25.4s, v13.8h\n"
+                "sadalp    v26.4s, v14.8h\n"
+                "smull    v13.8h, v3.8b, %[b1a].8b\n"
+                "sadalp    v27.4s, v15.8h\n"
+                "smull    v14.8h, v3.8b, %[b2a].8b\n"
+                "smull    v15.8h, v3.8b, %[b3a].8b\n"
+                "smlal2    v12.8h, v3.16b, %[b0a].16b\n"
+                "smlal2    v13.8h, v3.16b, %[b1a].16b\n"
+                "smlal2    v14.8h, v3.16b, %[b2a].16b\n"
+                "smlal2    v15.8h, v3.16b, %[b3a].16b\n"
+                "ldr     q3, [%[a_ptr], #48]\n"
+
+                // Start of unroll 0 for next iteration.
+                "sadalp    v28.4s, v12.8h\n"
+                "smull    v12.8h, v0.8b, %[b0].8b\n"
+                "sadalp    v29.4s, v13.8h\n"
+                "sadalp    v30.4s, v14.8h\n"
+                "smull    v13.8h, v0.8b, %[b1].8b\n"
+                "sadalp    v31.4s, v15.8h\n"
+                "bne    1b\n"
+
+                // Target to use when K=1 or 2 (i.e. zero iterations of main loop)
+                "4:\n"
+
+                // Branch to alternative tail for odd K
+                "cbnz    %w[oddk], 2f\n"
+
+                // Detached final iteration (even K)
+                "smull    v14.8h, v0.8b, %[b2].8b\n"
+                "smull    v15.8h, v0.8b, %[b3].8b\n"
+                "ldr    %q[b0a], [%[b_ptr], #64]\n"
+                "smlal2    v12.8h, v0.16b, %[b0].16b\n"
+                "smlal2    v13.8h, v0.16b, %[b1].16b\n"
+                "ldr    %q[b1a], [%[b_ptr], #80]\n"
+                "smlal2    v14.8h, v0.16b, %[b2].16b\n"
+                "smlal2    v15.8h, v0.16b, %[b3].16b\n"
+                "ldr     q0, [%[a_ptr], #64]\n"
+
+                "sadalp    v16.4s, v12.8h\n"
+                "smull    v12.8h, v1.8b, %[b0].8b\n"
+                "sadalp    v17.4s, v13.8h\n"
+                "sadalp    v18.4s, v14.8h\n"
+                "smull    v13.8h, v1.8b, %[b1].8b\n"
+                "sadalp    v19.4s, v15.8h\n"
+                "smull    v14.8h, v1.8b, %[b2].8b\n"
+                "ldr    %q[b2a], [%[b_ptr], #96]\n"
+                "smull    v15.8h, v1.8b, %[b3].8b\n"
+                "smlal2    v12.8h, v1.16b, %[b0].16b\n"
+                "ldr    %q[b3a], [%[b_ptr], #112]\n"
+                "smlal2    v13.8h, v1.16b, %[b1].16b\n"
+                "add    %[b_ptr], %[b_ptr], #128\n"
+                "smlal2    v14.8h, v1.16b, %[b2].16b\n"
+                "smlal2    v15.8h, v1.16b, %[b3].16b\n"
+                "ldr     q1, [%[a_ptr], #80]\n"
+
+                "sadalp    v20.4s, v12.8h\n"
+                "smull    v12.8h, v2.8b, %[b0].8b\n"
+                "sadalp    v21.4s, v13.8h\n"
+                "sadalp    v22.4s, v14.8h\n"
+                "smull    v13.8h, v2.8b, %[b1].8b\n"
+                "sadalp    v23.4s, v15.8h\n"
+                "smull    v14.8h, v2.8b, %[b2].8b\n"
+                "smull    v15.8h, v2.8b, %[b3].8b\n"
+                "smlal2    v12.8h, v2.16b, %[b0].16b\n"
+                "smlal2    v13.8h, v2.16b, %[b1].16b\n"
+                "smlal2    v14.8h, v2.16b, %[b2].16b\n"
+                "smlal2    v15.8h, v2.16b, %[b3].16b\n"
+                "ldr     q2, [%[a_ptr], #96]\n"
+
+                "sadalp    v24.4s, v12.8h\n"
+                "smull    v12.8h, v3.8b, %[b0].8b\n"
+                "sadalp    v25.4s, v13.8h\n"
+                "sadalp    v26.4s, v14.8h\n"
+                "smull    v13.8h, v3.8b, %[b1].8b\n"
+                "sadalp    v27.4s, v15.8h\n"
+                "smull    v14.8h, v3.8b, %[b2].8b\n"
+                "smull    v15.8h, v3.8b, %[b3].8b\n"
+                "smlal2    v12.8h, v3.16b, %[b0].16b\n"
+                "smlal2    v13.8h, v3.16b, %[b1].16b\n"
+                "smlal2    v14.8h, v3.16b, %[b2].16b\n"
+                "smlal2    v15.8h, v3.16b, %[b3].16b\n"
+                "ldr     q3, [%[a_ptr], #112]\n"
+
+                // Unroll 1
+                "sadalp    v28.4s, v12.8h\n"
+                "smull    v12.8h, v0.8b, %[b0a].8b\n"
+                "sadalp    v29.4s, v13.8h\n"
+                "sadalp    v30.4s, v14.8h\n"
+                "smull    v13.8h, v0.8b, %[b1a].8b\n"
+                "sadalp    v31.4s, v15.8h\n"
+                "smull    v14.8h, v0.8b, %[b2a].8b\n"
+                "add    %[a_ptr], %[a_ptr], #128\n"
+                "smull    v15.8h, v0.8b, %[b3a].8b\n"
+                "smlal2    v12.8h, v0.16b, %[b0a].16b\n"
+                "smlal2    v13.8h, v0.16b, %[b1a].16b\n"
+                "smlal2    v14.8h, v0.16b, %[b2a].16b\n"
+                "smlal2    v15.8h, v0.16b, %[b3a].16b\n"
+
+                "sadalp    v16.4s, v12.8h\n"
+                "smull    v12.8h, v1.8b, %[b0a].8b\n"
+                "sadalp    v17.4s, v13.8h\n"
+                "sadalp    v18.4s, v14.8h\n"
+                "smull    v13.8h, v1.8b, %[b1a].8b\n"
+                "sadalp    v19.4s, v15.8h\n"
+                "smull    v14.8h, v1.8b, %[b2a].8b\n"
+                "smull    v15.8h, v1.8b, %[b3a].8b\n"
+                "smlal2    v12.8h, v1.16b, %[b0a].16b\n"
+                "addp    v16.4s, v16.4s, v17.4s\n"
+                "smlal2    v13.8h, v1.16b, %[b1a].16b\n"
+                "addp    v17.4s, v18.4s, v19.4s\n"
+                "smlal2    v14.8h, v1.16b, %[b2a].16b\n"
+                "smlal2    v15.8h, v1.16b, %[b3a].16b\n"
+
+                "sadalp    v20.4s, v12.8h\n"
+                "smull    v12.8h, v2.8b, %[b0a].8b\n"
+                "sadalp    v21.4s, v13.8h\n"
+                "sadalp    v22.4s, v14.8h\n"
+                "smull    v13.8h, v2.8b, %[b1a].8b\n"
+                "sadalp    v23.4s, v15.8h\n"
+                "addp    v16.4s, v16.4s, v17.4s\n"
+                "smull    v14.8h, v2.8b, %[b2a].8b\n"
+                "addp    v18.4s, v20.4s, v21.4s\n"
+                "addp    v19.4s, v22.4s, v23.4s\n"
+                "smull    v15.8h, v2.8b, %[b3a].8b\n"
+                "smlal2    v12.8h, v2.16b, %[b0a].16b\n"
+                "str    q16, [%[c_ptr]]\n"
+                "smlal2    v13.8h, v2.16b, %[b1a].16b\n"
+                "smlal2    v14.8h, v2.16b, %[b2a].16b\n"
+                "smlal2    v15.8h, v2.16b, %[b3a].16b\n"
+
+                "sadalp    v24.4s, v12.8h\n"
+                "smull    v12.8h, v3.8b, %[b0a].8b\n"
+                "sadalp    v25.4s, v13.8h\n"
+                "sadalp    v26.4s, v14.8h\n"
+                "smull    v13.8h, v3.8b, %[b1a].8b\n"
+                "sadalp    v27.4s, v15.8h\n"
+                "addp    v17.4s, v18.4s, v19.4s\n"
+                "smull    v14.8h, v3.8b, %[b2a].8b\n"
+                "addp    v20.4s, v24.4s, v25.4s\n"
+                "addp    v21.4s, v26.4s, v27.4s\n"
+                "smull    v15.8h, v3.8b, %[b3a].8b\n"
+                "smlal2    v12.8h, v3.16b, %[b0a].16b\n"
+                "str    q17, [%[c_ptr], #16]\n"
+                "smlal2    v13.8h, v3.16b, %[b1a].16b\n"
+                "smlal2    v14.8h, v3.16b, %[b2a].16b\n"
+                "addp    v18.4s, v20.4s, v21.4s\n"
+                "smlal2    v15.8h, v3.16b, %[b3a].16b\n"
+                "b    3f\n"
+
+                // Detached final iteration (odd K)
+                "2:\n"
+                "smull    v14.8h, v0.8b, %[b2].8b\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+                "smull    v15.8h, v0.8b, %[b3].8b\n"
+                "add    %[b_ptr], %[b_ptr], #64\n"
+                "smlal2    v12.8h, v0.16b, %[b0].16b\n"
+                "smlal2    v13.8h, v0.16b, %[b1].16b\n"
+                "smlal2    v14.8h, v0.16b, %[b2].16b\n"
+                "smlal2    v15.8h, v0.16b, %[b3].16b\n"
+
+                "sadalp    v16.4s, v12.8h\n"
+                "smull    v12.8h, v1.8b, %[b0].8b\n"
+                "sadalp    v17.4s, v13.8h\n"
+                "sadalp    v18.4s, v14.8h\n"
+                "smull    v13.8h, v1.8b, %[b1].8b\n"
+                "sadalp    v19.4s, v15.8h\n"
+                "smull    v14.8h, v1.8b, %[b2].8b\n"
+                "smull    v15.8h, v1.8b, %[b3].8b\n"
+                "smlal2    v12.8h, v1.16b, %[b0].16b\n"
+                "addp    v16.4s, v16.4s, v17.4s\n"
+                "smlal2    v13.8h, v1.16b, %[b1].16b\n"
+                "addp    v17.4s, v18.4s, v19.4s\n"
+                "smlal2    v14.8h, v1.16b, %[b2].16b\n"
+                "smlal2    v15.8h, v1.16b, %[b3].16b\n"
+
+                "sadalp    v20.4s, v12.8h\n"
+                "smull    v12.8h, v2.8b, %[b0].8b\n"
+                "sadalp    v21.4s, v13.8h\n"
+                "sadalp    v22.4s, v14.8h\n"
+                "smull    v13.8h, v2.8b, %[b1].8b\n"
+                "sadalp    v23.4s, v15.8h\n"
+                "addp    v16.4s, v16.4s, v17.4s\n"
+                "smull    v14.8h, v2.8b, %[b2].8b\n"
+                "addp    v18.4s, v20.4s, v21.4s\n"
+                "addp    v19.4s, v22.4s, v23.4s\n"
+                "smull    v15.8h, v2.8b, %[b3].8b\n"
+                "smlal2    v12.8h, v2.16b, %[b0].16b\n"
+                "str    q16, [%[c_ptr]]\n"
+                "smlal2    v13.8h, v2.16b, %[b1].16b\n"
+                "smlal2    v14.8h, v2.16b, %[b2].16b\n"
+                "smlal2    v15.8h, v2.16b, %[b3].16b\n"
+
+                "sadalp    v24.4s, v12.8h\n"
+                "smull    v12.8h, v3.8b, %[b0].8b\n"
+                "sadalp    v25.4s, v13.8h\n"
+                "sadalp    v26.4s, v14.8h\n"
+                "smull    v13.8h, v3.8b, %[b1].8b\n"
+                "sadalp    v27.4s, v15.8h\n"
+                "addp    v17.4s, v18.4s, v19.4s\n"
+                "smull    v14.8h, v3.8b, %[b2].8b\n"
+                "addp    v20.4s, v24.4s, v25.4s\n"
+                "addp    v21.4s, v26.4s, v27.4s\n"
+                "smull    v15.8h, v3.8b, %[b3].8b\n"
+                "smlal2    v12.8h, v3.16b, %[b0].16b\n"
+                "str    q17, [%[c_ptr], #16]\n"
+                "smlal2    v13.8h, v3.16b, %[b1].16b\n"
+                "smlal2    v14.8h, v3.16b, %[b2].16b\n"
+                "addp    v18.4s, v20.4s, v21.4s\n"
+                "smlal2    v15.8h, v3.16b, %[b3].16b\n"
+
+                "3:\n"
+
+                // Final additions
+                "sadalp    v28.4s, v12.8h\n"
+                "str    q18, [%[c_ptr], #32]\n"
+                "sadalp    v29.4s, v13.8h\n"
+                "sadalp    v30.4s, v14.8h\n"
+                "sadalp    v31.4s, v15.8h\n"
+
+                // Horizontal reduction, phase 1
+                "addp    v22.4s, v28.4s, v29.4s\n"
+                "addp    v23.4s, v30.4s, v31.4s\n"
+
+                // Horizontal reduction, phase 2
+                "addp    v19.4s, v22.4s, v23.4s\n"
+                "str    q19, [%[c_ptr], #48]\n"
+                "add    %[c_ptr], %[c_ptr], #64\n"
+
+                :
+                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [b3] "+w"(b3),
+                [b0a] "+w"(b0a), [b1a] "+w"(b1a), [b2a] "+w"(b2a), [b3a] "+w"(b3a),
+                [k] "+r"(k)
+                : [oddk] "r"(oddk)
+                : "x20", "x21", "v0", "v1", "v2", "v3", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+                "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
new file mode 100644
index 0000000..3975732
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_gemm_u16_asimd_12x8(const uint16_t *, const uint16_t *, uint32_t *, int, int, int);
+
+// 12x8 GEMM "strategy" class for uint16_t operands.
+//
+// This describes the characteristics of a family of kernels, in terms of
+// the required interleave properties and the output block size.
+//
+// All kernels in the family must share these characteristics.  The actual
+// kernel to be used can be chosen at runtime, based on the CPUInfo
+// structure passed to the constructor.
+class gemm_u16_12x8
+{
+public:
+    typedef uint16_t operand_type;
+    typedef uint32_t result_type;
+
+    typedef void (*kern_type)(const uint16_t *, const uint16_t *, uint32_t *, int, int, int);
+
+    /* Describes the data layout for A input */
+    static const int A_interleave = 8;
+    static const int A_block      = 1;
+    static const int A_transpose  = 0;
+
+    /* Same for B input */
+    static const int B_interleave = 12;
+    static const int B_block      = 1;
+    static const int B_transpose  = 1;
+
+    /* Kernel blocking parameters */
+    static const int out_width  = 12;
+    static const int out_height = 8;
+    static const int k_unroll   = 1;
+
+    kern_type kernel = a64_gemm_u16_asimd_12x8;
+
+    gemm_u16_12x8(const CPUInfo *ci)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp
new file mode 100644
index 0000000..7903878
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm
+{
+void a64_gemm_u16_asimd_12x8(const uint16_t *Apanel, const uint16_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K)
+{
+    const uint16_t *a_ptr = Apanel;
+    uint32_t       *c_ptr = Cpanel;
+
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const uint16_t *a_ptr0 = a_ptr;
+        const uint16_t *b_ptr  = Bpanel;
+
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            a_ptr            = a_ptr0;
+            const bool odd_k = K & 0x1;
+            int        k     = (K + 1) / 2 - 1;
+
+            register uint16x8_t aa asm("v0");
+            register uint16x8_t ab asm("v1");
+            register uint16x8_t b0 asm("v2");
+            register uint16x8_t b1 asm("v3");
+            register uint16x8_t b2 asm("v4");
+
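+            // For reference, v5-v28 hold the 8x12 uint32 output tile: row r of
+            // A accumulates into v(5+r) (columns 0-3), v(13+r) (columns 4-7)
+            // and v(21+r) (columns 8-11), matching the q5, q13, q21, q6, ...
+            // store order in the tails below.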
+            __asm __volatile(
+                "ldr %d[aa], [%x[a_ptr]]\n" // Load A[A].lower
+                "movi v5.4s, #0\n"
+                "ldr x20, [%x[a_ptr], #0x08]\n" // Load A[A].upper
+                "movi v6.4s, #0\n"
+                "ldr %d[b0], [%x[b_ptr]]\n" // Load B[0].lower
+                "ins %[aa].d[1], x20\n"     // Merge A[A].lower and upper
+                "movi v7.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi v8.4s, #0\n"
+                "ldr x20, [%x[b_ptr], #0x08]\n" // Load B[0].upper
+                "movi v9.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi v10.4s, #0\n"
+                "ldr %d[b1], [%x[b_ptr], #0x10]\n" // Load B[1].lower
+                "ins %[b0].d[1], x20\n"            // Merge B[0].lower and upper
+                "movi v11.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #96]")
+                "movi v12.4s, #0\n"
+                "movi v13.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #96]")
+                "movi v14.4s, #0\n"
+                "movi v15.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #128]")
+                "movi v16.4s, #0\n"
+                "movi v17.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi v18.4s, #0\n"
+                "movi v19.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #160]")
+                "movi v20.4s, #0\n"
+                "movi v21.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #160]")
+                "movi v22.4s, #0\n"
+                "movi v23.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi v24.4s, #0\n"
+                "add %x[a_ptr], %x[a_ptr], #0x10\n"
+                "movi v25.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi v26.4s, #0\n"
+                "add %x[b_ptr], %x[b_ptr], #0x18\n"
+                "movi v27.4s, #0\n"
+                "movi v28.4s, #0\n"
+
+                "cbz %x[k], 2f\n" // Skip the loop if doing zero iterations.
+
+                "1:\n" // Main loop
+                // First unroll
+                "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+                "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
+                "umlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+                "umlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+                "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
+                "ins %[b1].d[1], x20\n"     // Merge B[1].lower and .upper
+                "umlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+                "umlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+                "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
+                "umlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+                "umlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+                "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
+                "ins %[ab].d[1], x20\n"           // Merge A[B].lower and .upper
+                "umlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+                "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+                "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
+                "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+                "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+                "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+                "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+                "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+                "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+                "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+                "ldr %d[b0], [%x[b_ptr], #0x18]\n" // Load B[0].lower
+                "ins %[b2].d[1], x20\n"            // Merge B[2].lower and .upper
+                "umlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+                "umlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+                "ldr x20, [%x[b_ptr], #0x20]\n" // Load B[0].upper
+                "umlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+                "umlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+                "umlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+                "umlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+                "umlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+                "umlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+
+                // Second unroll
+                "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
+                "ldr %d[aa], [%x[a_ptr], #0x10]\n" // Load A[A].lower
+                "ins %[b0].d[1], x20\n"            // Merge B[0].lower and .upper
+                "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
+                "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
+                "ldr x20, [%x[a_ptr], #0x18]\n" // Load A[A].upper
+                "umlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
+                "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
+                "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
+                "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
+                "add %x[a_ptr], %x[a_ptr], #0x20\n"
+                "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
+                "umlal v13.4s, %[b2].4h, %[ab].h[0]\n" ASM_PREFETCH("[%[b_ptr], #320]")
+                "umlal v14.4s, %[b2].4h, %[ab].h[1]\n"
+                "umlal v15.4s, %[b2].4h, %[ab].h[2]\n" ASM_PREFETCH("[%[a_ptr], #320]")
+                "umlal v16.4s, %[b2].4h, %[ab].h[3]\n"
+                "umlal v17.4s, %[b2].4h, %[ab].h[4]\n" ASM_PREFETCH("[%[b_ptr], #448]")
+                "umlal v18.4s, %[b2].4h, %[ab].h[5]\n"
+                "umlal v19.4s, %[b2].4h, %[ab].h[6]\n"
+                "umlal v20.4s, %[b2].4h, %[ab].h[7]\n"
+                "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
+                "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
+                "subs %x[k], %x[k], #0x1\n"
+                "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
+                "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
+                "ldr %d[b1], [%x[b_ptr], #0x28]\n" // Load B[1].lower
+                "ins %[aa].d[1], x20\n"            // Merge A[A].lower and .upper
+                "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
+                "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
+                "add %x[b_ptr], %x[b_ptr], #0x30\n"
+                "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
+                "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
+                "bne 1b\n"
+
+                "2:\n" // Even tail
+                "cbnz %x[odd_k], 3f\n"
+
+                "umlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+                "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
+                "umlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+                "umlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+                "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
+                "ins %[b1].d[1], x20\n"     // Merge B[1].lower and .upper
+                "umlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+                "umlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+                "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
+                "umlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+                "umlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+                "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
+                "ins %[ab].d[1], x20\n"           // Merge A[B].lower and .upper
+                "umlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+                "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+                "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
+                "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+                "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+                "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+                "add %[a_ptr], %[a_ptr], #0x10\n"
+                "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+                "add %[b_ptr], %[b_ptr], #0x18\n"
+                "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+                "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+                "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+                "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper
+                "umlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+                "umlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+                "umlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+                "umlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+                "umlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+                "umlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+                "umlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+                "umlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+
+                "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
+                "umlal v13.4s, %[b2].4h, %[ab].h[0]\n"
+                "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
+                "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
+                "umlal v14.4s, %[b2].4h, %[ab].h[1]\n"
+                "str q5, [%x[c_ptr]]\n"
+                "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
+                "str q13, [%x[c_ptr], #0x10]\n"
+                "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
+                "str q21, [%x[c_ptr], #0x20]\n"
+                "umlal v15.4s, %[b2].4h, %[ab].h[2]\n"
+                "str q6, [%x[c_ptr], #0x30]\n"
+                "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
+                "str q14, [%x[c_ptr], #0x40]\n"
+                "umlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
+                "str q22, [%x[c_ptr], #0x50]\n"
+                "umlal v16.4s, %[b2].4h, %[ab].h[3]\n"
+                "str q7, [%x[c_ptr], #0x60]\n"
+                "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
+                "str q15, [%x[c_ptr], #0x70]\n"
+                "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
+                "str q23, [%x[c_ptr], #0x80]\n"
+                "umlal v17.4s, %[b2].4h, %[ab].h[4]\n"
+                "str q8, [%x[c_ptr], #0x90]\n"
+                "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
+                "str q16, [%x[c_ptr], #0xa0]\n"
+                "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
+                "str q24, [%x[c_ptr], #0xb0]\n"
+                "umlal v18.4s, %[b2].4h, %[ab].h[5]\n"
+                "str q9, [%x[c_ptr], #0xc0]\n"
+                "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
+                "str q17, [%x[c_ptr], #0xd0]\n"
+                "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
+                "str q25, [%x[c_ptr], #0xe0]\n"
+                "umlal v19.4s, %[b2].4h, %[ab].h[6]\n"
+                "str q10, [%x[c_ptr], #0xf0]\n"
+                "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
+                "str q18, [%x[c_ptr], #0x100]\n"
+                "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
+                "str q26, [%x[c_ptr], #0x110]\n"
+                "umlal v20.4s, %[b2].4h, %[ab].h[7]\n"
+                "str q11, [%x[c_ptr], #0x120]\n"
+                "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
+                "str q19, [%x[c_ptr], #0x130]\n"
+                "b 4f\n" // Complete write out
+
+                "3:\n" // Odd tail
+                "umlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+                "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+                "umlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+                "umlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+                "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+                "umlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+                "str q5, [%x[c_ptr]]\n"
+                "umlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+                "str q13, [%x[c_ptr], #0x10]\n"
+                "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+                "str q21, [%x[c_ptr], #0x20]\n"
+                "umlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+                "str q6, [%x[c_ptr], #0x30]\n"
+                "umlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+                "str q14, [%x[c_ptr], #0x40]\n"
+                "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+                "str q22, [%x[c_ptr], #0x50]\n"
+                "umlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+                "str q7, [%x[c_ptr], #0x60]\n"
+                "umlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+                "str q15, [%x[c_ptr], #0x70]\n"
+                "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+                "str q23, [%x[c_ptr], #0x80]\n"
+                "umlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+                "str q8, [%x[c_ptr], #0x90]\n"
+                "umlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+                "str q16, [%x[c_ptr], #0xa0]\n"
+                "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+                "str q24, [%x[c_ptr], #0xb0]\n"
+                "umlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+                "str q9, [%x[c_ptr], #0xc0]\n"
+                "umlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+                "str q17, [%x[c_ptr], #0xd0]\n"
+                "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+                "str q25, [%x[c_ptr], #0xe0]\n"
+                "umlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+                "str q10, [%x[c_ptr], #0xf0]\n"
+                "umlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+                "str q18, [%x[c_ptr], #0x100]\n"
+                "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+                "str q26, [%x[c_ptr], #0x110]\n"
+                "umlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+                "str q11, [%x[c_ptr], #0x120]\n"
+
+                "4:\n" // End of function
+                "str q19, [%x[c_ptr], #0x130]\n"
+                "str q27, [%x[c_ptr], #0x140]\n"
+                "str q12, [%x[c_ptr], #0x150]\n"
+                "str q20, [%x[c_ptr], #0x160]\n"
+                "str q28, [%x[c_ptr], #0x170]\n"
+                "add %x[c_ptr], %x[c_ptr], #0x180\n"
+                : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), [k] "+r"(k),
+                [aa] "+w"(aa), [ab] "+w"(ab), [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2)
+                : [odd_k] "r"(odd_k)
+                : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
new file mode 100644
index 0000000..26255b1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_gemm_u8_12x8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void a64_gemm_u8_12x8_a55r1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+
+class gemm_u8_12x8
+{
+public:
+    typedef uint8_t  operand_type;
+    typedef uint32_t result_type;
+
+    typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+
+    /* Describes the data layout for A input */
+    static const int  A_interleave = 8;
+    static const int  A_block      = 4;
+    static const bool A_transpose  = false;
+
+    /* Same for B input */
+    static const int  B_interleave = 12;
+    static const int  B_block      = 4;
+    static const bool B_transpose  = true;
+
+    /* Kernel blocking parameters */
+    static const int out_width  = 12;
+    static const int out_height = 8;
+    static const int k_unroll   = 4;
+
+    kern_type kernel = a64_gemm_u8_12x8;
+
+    gemm_u8_12x8(const CPUInfo *ci)
+    {
+        if(ci->get_cpu_model() == CPUModel::A55r1)
+        {
+            kernel = a64_gemm_u8_12x8_a55r1;
+        }
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
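The blocking parameters in this strategy header are not just documentation: they fix the panel strides the hand-written kernels below rely on. As a cross-check, here is a minimal C++ sketch (editorial, not part of the patch; the constant names are illustrative) relating them to the pointer updates visible in the assembly, where the main loops advance A by 64 and B by 96 bytes per two k-steps and C by 384 bytes per output tile.

#include <cstddef>
#include <cstdint>

// Mirrors the gemm_u8_12x8 blocking parameters above.
constexpr int out_width  = 12;
constexpr int out_height = 8;
constexpr int k_unroll   = 4;

// Panel bytes consumed per k_unroll step of the inner loop.
constexpr std::size_t a_bytes_per_kstep = out_height * k_unroll * sizeof(uint8_t); // 32
constexpr std::size_t b_bytes_per_kstep = out_width * k_unroll * sizeof(uint8_t);  // 48

// Bytes written per 8x12 tile of 32-bit results.
constexpr std::size_t c_bytes_per_tile = out_width * out_height * sizeof(uint32_t); // 384

// The kernels unroll two k steps per loop iteration, matching the
// "add %[a_ptr], #64" / "add %[b_ptr], #96" / "add %[c_ptr], #384" updates.
static_assert(2 * a_bytes_per_kstep == 64, "A panel advance per loop iteration");
static_assert(2 * b_bytes_per_kstep == 96, "B panel advance per loop iteration");
static_assert(c_bytes_per_tile == 384, "C advance per output tile");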
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp
new file mode 100644
index 0000000..f8fafbd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+#include "dot_toolchain_support.h"
+#endif
+
+namespace arm_gemm
+{
+void a64_gemm_u8_12x8_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, const int ablocks, const int bblocks, const int K)
+{
+    const uint8_t *a_ptr = Apanel;
+    uint32_t      *c_ptr = Cpanel;
+
+    // We divide K by 4 because the udot instruction processes 4 elements at a time.
+    const int W = K / 4;
+
+    // Fix up for odd lengths - set a flag if K is odd, but make
+    // sure we round up the iteration count.
+    const int oddk    = (W & 1);
+    const int k_iters = ((W + 1) / 2) - 1;
+
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const uint8_t *a_ptr0 = a_ptr;
+        const uint8_t *b_ptr  = Bpanel;
+
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            a_ptr = a_ptr0;
+            int k = k_iters;
+
+            register uint8x16_t a0 asm("v0");
+            register uint8x16_t a1 asm("v1");
+            register uint8x16_t b0 asm("v2");
+            register uint8x16_t b1 asm("v3");
+            register uint8x16_t b2 asm("v4");
+            register uint8x16_t a0a asm("v5");
+            register uint8x16_t a1a asm("v6");
+
+            __asm __volatile(
+#ifdef NO_DOT_IN_TOOLCHAIN
+                _DECLARE_UDOT
+#else
+                ".arch armv8.2-a+dotprod\n"
+#endif
+                // Initialize result registers, load initial operands, prime prefetches.
+                "movi   v8.4s, #0x0\n"
+                "ldr    %q[a0], [%[a_ptr]]\n"
+                "movi   v9.4s, #0x0\n"
+                "ldr    %q[b0], [%[b_ptr]]\n"
+                "movi   v10.4s, #0x0\n"
+                "ldr    %q[a1], [%[a_ptr], #16]\n"
+                "movi   v11.4s, #0x0\n"
+                "ldr    %q[b1], [%[b_ptr], #16]\n"
+                "movi   v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi   v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi   v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi   v15.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #128]")
+                "movi   v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi   v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi   v18.4s, #0x0\n"
+                "movi   v19.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi   v20.4s, #0x0\n"
+                "movi   v21.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi   v22.4s, #0x0\n"
+                "movi   v23.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]")
+                "movi   v24.4s, #0x0\n"
+                "movi   v25.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #384]")
+                "movi   v26.4s, #0x0\n"
+                "movi   v27.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #448]")
+                "movi   v28.4s, #0x0\n"
+                "movi   v29.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #384]")
+                "movi   v30.4s, #0x0\n"
+                "movi   v31.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #512]")
+
+                // The loop is offset by these two instructions which must
+                // always be executed.
+                "udot   v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                "ldr    %d[b2], [%[b_ptr], #32]\n"
+
+                // Skip loop if we are doing zero iterations of it.
+                "cbz    %w[k], 4f\n"
+
+                "1:\n"
+                "udot      v9.4s , %[b0].16b, %[a0].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #40]\n"
+                "udot    v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "subs    %w[k], %w[k], #1\n"
+                "udot    v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "ldr    %d[a0a], [%[a_ptr], #32]\n"
+
+                "udot     v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "udot    v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                "ldr    x20, [%[a_ptr], #40]\n"
+                "udot    v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "udot    v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "ldr    %d[a1a], [%[a_ptr], #48]\n"
+
+                "udot    v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "ins    %[a0a].d[1], x20\n"
+                "udot    v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                "ldr    x20, [%[a_ptr], #56]\n"
+                "udot    v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "udot    v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "ldr    %d[b0], [%[b_ptr], #48]\n"
+
+                "udot    v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "ins    %[a1a].d[1], x20\n"
+                "udot    v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #56]\n"
+                "udot    v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "udot    v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "ldr    %d[b1], [%[b_ptr], #64]\n"
+
+                "udot    v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "ins    %[b0].d[1], x20\n"
+                "udot    v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #72]\n"
+                "udot    v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "udot    v27.4s, %[b2].16b, %[a0].4b[3]\n" ASM_PREFETCH("[%[a_ptr], #448]")
+
+                "udot    v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "udot    v29.4s, %[b2].16b, %[a1].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #576]")
+                "udot    v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                "udot    v31.4s, %[b2].16b, %[a1].4b[3]\n"
+
+                // Unroll 1
+                "ldr    %d[b2], [%[b_ptr], #80]\n"
+
+                "udot    v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+                "ins    %[b1].d[1], x20\n"
+                "udot    v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #88]\n"
+                "udot    v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+                "udot    v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+                "ldr    %d[a0], [%[a_ptr], #64]\n"
+
+                "udot     v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "udot   v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+                "ldr    x20, [%[a_ptr], #72]\n"
+                "udot    v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+                "udot    v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+                "ldr    %d[a1], [%[a_ptr], #80]\n"
+
+                "udot    v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+                "ins    %[a0].d[1], x20\n"
+                "udot    v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+                "ldr    x20, [%[a_ptr], #88]\n"
+                "udot    v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+                "udot    v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+                "ldr    %d[b0], [%[b_ptr], #96]\n"
+
+                "udot    v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+                "ins    %[a1].d[1], x20\n"
+                "udot    v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #104]\n"
+                "udot    v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+                "udot    v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+                "ldr    %d[b1], [%[b_ptr], #112]\n"
+
+                "udot    v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+                "ins    %[b0].d[1], x20\n"
+                "udot    v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #120]\n"
+                "udot    v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+                "udot    v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+
+                "udot    v28.4s, %[b2].16b, %[a1a].4b[0]\n" ASM_PREFETCH("[%[b_ptr], #640]")
+                "udot    v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "udot    v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+                "ins    %[b1].d[1], x20\n"
+                "udot    v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+                "ldr    %d[b2], [%[b_ptr], #32]\n"
+
+                "udot   v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                "b.ne    1b\n"
+
+                // Branch here if W (=K/4) is 1 or 2, i.e. the main loop ran zero times.  Do the right thing for odd/even at the end.
+                "4:\n"
+
+                // Start final iteration - branch off to "odd" code before we load a0a.
+                "udot   v9.4s , %[b0].16b, %[a0].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #40]\n"
+                "udot   v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "cbnz   %w[oddk], 2f\n"
+
+                // Even K continuation
+                "udot    v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "ldr    %d[a0a], [%[a_ptr], #32]\n"
+
+                "udot     v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "udot   v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                "ldr    x20, [%[a_ptr], #40]\n"
+                "udot    v14.4s, %[b0].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr]]")
+                "udot    v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "ldr    %d[a1a], [%[a_ptr], #48]\n"
+
+                "udot    v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "ins    %[a0a].d[1], x20\n"
+                "udot    v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                "ldr    x20, [%[a_ptr], #56]\n"
+                "udot    v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "udot    v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "ldr    %d[b0], [%[b_ptr], #48]\n"
+
+                "udot    v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "ins    %[a1a].d[1], x20\n"
+                "udot    v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #56]\n"
+                "udot    v22.4s, %[b1].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+                "udot    v23.4s, %[b1].16b, %[a1].4b[3]\n"
+
+                "udot    v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "udot    v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+                "udot    v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "udot    v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "ldr    %d[b1], [%[b_ptr], #64]\n"
+
+                "udot    v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "ins    %[b0].d[1], x20\n"
+                "udot    v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #72]\n"
+                "udot    v30.4s, %[b2].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+                "udot    v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "ldr    %d[b2], [%[b_ptr], #80]\n"
+
+                "udot    v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+                "ins    %[b1].d[1], x20\n"
+                "udot   v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+                "ldr    x20, [%[b_ptr], #88]\n"
+                "udot    v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+                "ins    %[b2].d[1], x20\n"
+
+                "udot   v11.4s, %[b0].16b, %[a0a].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+                "udot   v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+                "udot   v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+                "udot   v14.4s, %[b0].16b, %[a1a].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+                "udot   v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+                "udot   v16.4s, %[b1].16b, %[a0a].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+                "udot   v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+                "udot   v18.4s, %[b1].16b, %[a0a].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+                "udot   v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+                "udot   v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+                "udot   v21.4s, %[b1].16b, %[a1a].4b[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]")
+                "udot   v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+                "udot   v23.4s, %[b1].16b, %[a1a].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]")
+                "udot   v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+                "udot   v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+                "udot   v26.4s, %[b2].16b, %[a0a].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #640]")
+                "udot   v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+                "udot   v28.4s, %[b2].16b, %[a1a].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+                "udot   v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+                "udot   v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "udot   v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+                "b      3f\n"
+
+                // Odd K continuation
+                "2:\n"
+                "udot   v11.4s, %[b0].16b, %[a0].4b[3]\n" ASM_PREFETCHW("[%[c_ptr]]")
+                "udot   v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "udot   v13.4s, %[b0].16b, %[a1].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+                "udot   v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "add    %[a_ptr], %[a_ptr], #32\n"
+                "udot   v15.4s, %[b0].16b, %[a1].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+                "udot   v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "add    %[b_ptr], %[b_ptr], #48\n"
+                "udot   v17.4s, %[b1].16b, %[a0].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+                "udot   v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "udot   v19.4s, %[b1].16b, %[a0].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+                "udot   v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "udot   v21.4s, %[b1].16b, %[a1].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+                "udot   v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "udot   v23.4s, %[b1].16b, %[a1].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+                "udot   v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "udot   v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+                "udot   v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "udot   v27.4s, %[b2].16b, %[a0].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]")
+                "udot   v28.4s, %[b2].16b, %[a1].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]")
+                "udot   v29.4s, %[b2].16b, %[a1].4b[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #640]")
+                "udot   v30.4s, %[b2].16b, %[a1].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+                "udot   v31.4s, %[b2].16b, %[a1].4b[3]\n"
+
+                // Common tail
+                "3:\n"
+                "str    q8,   [%[c_ptr]]\n"
+                "str    q16,  [%[c_ptr], #16]\n"
+                "str    q24,  [%[c_ptr], #32]\n"
+                "str    q9,   [%[c_ptr], #48]\n"
+                "str    q17,  [%[c_ptr], #64]\n"
+                "str    q25,  [%[c_ptr], #80]\n"
+                "str    q10,  [%[c_ptr], #96]\n"
+                "str    q18,  [%[c_ptr], #112]\n"
+                "str    q26,  [%[c_ptr], #128]\n"
+                "str    q11,  [%[c_ptr], #144]\n"
+                "str    q19,  [%[c_ptr], #160]\n"
+                "str    q27,  [%[c_ptr], #176]\n"
+                "str    q12,  [%[c_ptr], #192]\n"
+                "str    q20,  [%[c_ptr], #208]\n"
+                "str    q28,  [%[c_ptr], #224]\n"
+                "str    q13,  [%[c_ptr], #240]\n"
+                "str    q21,  [%[c_ptr], #256]\n"
+                "str    q29,  [%[c_ptr], #272]\n"
+                "str    q14,  [%[c_ptr], #288]\n"
+                "str    q22,  [%[c_ptr], #304]\n"
+                "str    q30,  [%[c_ptr], #320]\n"
+                "str    q15,  [%[c_ptr], #336]\n"
+                "str    q23,  [%[c_ptr], #352]\n"
+                "str    q31,  [%[c_ptr], #368]\n"
+                "add    %[c_ptr], %[c_ptr], #384\n"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+                ".purgem udot\n"
+#endif
+                :
+                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+                [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+                : [oddk] "r"(oddk)
+                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+                "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif
\ No newline at end of file
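Reading back the loop-count setup at the top of this kernel: W counts blocks of four K values (one udot each), the main loop retires two such blocks per trip, and the detached tail handles the last one or two. A small editorial sketch (not part of the patch) of the same arithmetic with representative K values:

#include <cassert>

int main()
{
    const int Ks[] = {4, 8, 16, 20};
    for (int K : Ks) {
        const int W       = K / 4;             // k-blocks of 4 (one udot per block)
        const int oddk    = (W & 1);           // is a single trailing block left over?
        const int k_iters = ((W + 1) / 2) - 1; // full two-block loop trips

        // K=16 -> W=4, oddk=0, k_iters=1;  K=20 -> W=5, oddk=1, k_iters=2;
        // K=8  -> W=2, oddk=0, k_iters=0 (loop skipped via "cbz %w[k], 4f").
        assert(W == 2 * k_iters + (oddk ? 1 : 2));
    }
    return 0;
}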
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h
new file mode 100644
index 0000000..5ee273b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// Define a macro to assemble the UDOT instruction (in the absence of toolchain support)
+#define _DECLARE_UDOT                                                                                  \
+    ".altmacro\n"                                                                                      \
+    ".macro udot opd:req, opn:req, opm:req\n"                                                          \
+    "local vd, vn, vm, h, l\n"                                                                         \
+    ".irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n" \
+    ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n"                                                               \
+    ".set vd,\\reg\n"                                                                                  \
+    ".endif\n"                                                                                         \
+    ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n"                                                              \
+    ".set vn,\\reg\n"                                                                                  \
+    ".endif\n"                                                                                         \
+    ".irp idx,0,1,2,3\n"                                                                               \
+    ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n"                                                      \
+    ".set vm,\\reg\n"                                                                                  \
+    ".set h,\\idx / 2\n"                                                                               \
+    ".set l,\\idx %% 2\n"                                                                              \
+    ".endif\n"                                                                                         \
+    ".endr\n"                                                                                          \
+    ".endr\n"                                                                                          \
+    ".ifndef vd\n"                                                                                     \
+    ".error \"Bad operand \\opd\"\n"                                                                   \
+    ".exitm\n"                                                                                         \
+    ".endif\n"                                                                                         \
+    ".ifndef vn\n"                                                                                     \
+    ".error \"Bad operand \\opn\"\n"                                                                   \
+    ".exitm\n"                                                                                         \
+    ".endif\n"                                                                                         \
+    ".ifndef vm\n"                                                                                     \
+    ".error \"Bad operand \\opm\"\n"                                                                   \
+    ".exitm\n"                                                                                         \
+    ".endif\n"                                                                                         \
+    ".ifndef h\n"                                                                                      \
+    ".error \"Bad operand \\opm\"\n"                                                                   \
+    ".exitm\n"                                                                                         \
+    ".endif\n"                                                                                         \
+    ".ifndef l\n"                                                                                      \
+    ".error \"Bad operand \\opm\"\n"                                                                   \
+    ".exitm\n"                                                                                         \
+    ".endif\n"                                                                                         \
+    ".int     0x6f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n"                      \
+    ".endm\n"
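When the toolchain cannot assemble UDOT, the macro above emits the instruction word directly from the register numbers and the lane index. The following editorial sketch (not part of the patch) evaluates the same ".int" expression in C++ for one concrete instance, which can help when eyeballing a disassembly:

#include <cstdint>

// Same field packing as the ".int" line in _DECLARE_UDOT: h = idx/2, l = idx%2.
constexpr uint32_t udot_by_element(unsigned vd, unsigned vn, unsigned vm, unsigned idx)
{
    return 0x6f80e000u | vd | (vn << 5) | (vm << 16) | ((idx % 2) << 21) | ((idx / 2) << 11);
}

// "udot v8.4s, v2.16b, v0.4b[1]" should encode to 0x6fa0e048.
static_assert(udot_by_element(8, 2, 0, 1) == 0x6fa0e048u, "matches the hand-assembled UDOT");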
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp
new file mode 100644
index 0000000..d026dc5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+#include "dot_toolchain_support.h"
+#endif
+
+namespace arm_gemm
+{
+void a64_gemm_u8_12x8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K)
+{
+    const uint8_t *a_ptr = Apanel;
+    uint32_t      *c_ptr = Cpanel;
+    // We divide K by 4 because the udot instruction processes 4 elements at a time.
+    const int W = K / 4;
+    // Fix up for odd lengths - set a flag if K is odd, but make
+    // sure we round up the iteration count.
+    const int oddk         = (W & 1);
+    const int init_value_k = ((W + 1) / 2) - 1;
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const uint8_t *a_ptr0 = a_ptr;
+        const uint8_t *b_ptr  = Bpanel;
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            a_ptr                 = a_ptr0;
+            int                 k = init_value_k;
+            register uint8x16_t a0 asm("v0");
+            register uint8x16_t a1 asm("v1");
+            register uint8x16_t b0 asm("v2");
+            register uint8x16_t b1 asm("v3");
+            register uint8x16_t b2 asm("v4");
+            register uint8x16_t a0a asm("v5");
+            register uint8x16_t a1a asm("v6");
+            __asm __volatile(
+#ifdef NO_DOT_IN_TOOLCHAIN
+                _DECLARE_UDOT
+#else
+                ".arch  armv8.2-a+dotprod\n"
+#endif
+                // Initialize result registers, load initial operands, prime prefetches.
+                "movi    v8.4s, #0x0\n"
+                "ldr    %q[a0], [%[a_ptr]]\n"
+                "movi    v9.4s, #0x0\n"
+                "ldr    %q[b0], [%[b_ptr]]\n"
+                "movi    v10.4s, #0x0\n"
+                "ldr    %q[a1], [%[a_ptr], #16]\n"
+                "movi    v11.4s, #0x0\n"
+                "ldr    %q[b1], [%[b_ptr], #16]\n"
+                "movi    v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi    v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi    v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi    v15.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #128]")
+                "movi    v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi    v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi    v18.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi    v19.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi    v20.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]")
+                "movi    v21.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #384]")
+                "movi    v22.4s, #0x0\n"
+                "movi    v23.4s, #0x0\n"
+                "movi    v24.4s, #0x0\n"
+                "movi    v25.4s, #0x0\n"
+                "movi    v26.4s, #0x0\n"
+                "movi    v27.4s, #0x0\n"
+                "movi    v28.4s, #0x0\n"
+                "movi    v29.4s, #0x0\n"
+                "movi    v30.4s, #0x0\n"
+                "movi    v31.4s, #0x0\n"
+
+                // Skip loop if we are doing zero iterations of it.
+                "cbz    %w[k], 4f\n"
+
+                // Loop proper
+                "1:\n"
+                "udot    v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                "udot      v9.4s , %[b0].16b, %[a0].4b[1]\n"
+
+                "ldr    %q[b2], [%[b_ptr], #32]\n"
+                "udot    v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "udot    v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "ldr    %q[a0a], [%[a_ptr], #32]\n"
+                "udot     v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "udot    v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                "ldr    %q[a1a], [%[a_ptr], #48]\n"
+                "udot    v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "udot    v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "ldr    %q[b0], [%[b_ptr], #48]\n"
+
+                "udot    v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "udot    v17.4s, %[b1].16b, %[a0].4b[1]\n" ASM_PREFETCH("[%[a_ptr], #320]")
+                "udot    v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "udot    v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "udot    v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "udot    v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "udot    v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "udot    v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "ldr    %q[b1], [%[b_ptr], #64]\n"
+
+                "udot    v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "udot    v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #448]")
+                "udot    v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "udot    v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "udot    v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "udot    v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                "udot    v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                "udot    v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "ldr    %q[b2], [%[b_ptr], #80]\n"
+
+                "udot    v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+                "udot    v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+                "ldr    %q[a0], [%[a_ptr], #64]\n"
+                "udot    v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+                "udot    v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+                "udot     v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+                "ldr    %q[a1], [%[a_ptr], #80]\n"
+                "udot   v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+                "udot    v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+                "udot    v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+                "ldr    %q[b0], [%[b_ptr], #96]\n"
+
+                "udot    v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+                "udot    v17.4s, %[b1].16b, %[a0a].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #512]")
+                "udot    v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+                "udot    v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+                "udot    v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+                "udot    v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+                "udot    v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+                "udot    v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+                "ldr    %q[b1], [%[b_ptr], #112]\n"
+
+                "udot    v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+                "udot    v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+                "udot    v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+                "udot    v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "udot    v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+                "udot    v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+                "subs    %w[k], %w[k], #1\n"
+                "udot    v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+                "udot    v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+                "bne    1b\n"
+
+                // Target to use when W (=K/4) is 1 or 2 (i.e. zero iterations of main loop)
+                "4:\n"
+
+                // Branch to alternative tail for odd K
+                "cbnz    %w[oddk], 2f\n"
+
+                // Detached final iteration (even K)
+                "udot    v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                "udot   v9.4s , %[b0].16b, %[a0].4b[1]\n"
+                "ldr    %q[b2], [%[b_ptr], #32]\n"
+                "udot    v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "udot    v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "ldr    %q[a0a], [%[a_ptr], #32]\n"
+                "udot     v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "udot   v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                "ldr    %q[a1a], [%[a_ptr], #48]\n"
+                "udot    v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "udot    v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "ldr    %q[b0], [%[b_ptr], #48]\n"
+
+                "udot    v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "udot    v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                "udot    v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "udot    v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "udot    v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "udot    v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "udot    v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "udot    v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "ldr    %q[b1], [%[b_ptr], #64]\n"
+
+                "udot    v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "udot    v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+                "udot    v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "udot    v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "udot    v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "udot    v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                "udot    v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                "udot    v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "ldr    %q[b2], [%[b_ptr], #80]\n"
+
+                "udot    v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+
+                "udot    v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "udot   v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+                "str    q8, [%[c_ptr], #0]\n"
+                "udot    v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+                "str    q16, [%[c_ptr], #16]\n"
+                "udot    v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+                "str    q24, [%[c_ptr], #32]\n"
+
+                "udot    v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+                "str    q9, [%[c_ptr], #48]\n"
+                "udot    v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+                "str    q17, [%[c_ptr], #64]\n"
+                "udot    v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+                "str    q25, [%[c_ptr], #80]\n"
+                "udot    v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+                "str    q10, [%[c_ptr], #96]\n"
+
+                "udot    v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+                "str    q18, [%[c_ptr], #112]\n"
+                "udot    v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+                "str    q26, [%[c_ptr], #128]\n"
+                "udot    v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+                "str    q11, [%[c_ptr], #144]\n"
+
+                "udot     v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+                "str    q19, [%[c_ptr], #160]\n"
+                "udot    v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+                "str    q27, [%[c_ptr], #176]\n"
+                "udot    v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+                "str    q12, [%[c_ptr], #192]\n"
+
+                "udot   v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+                "str    q20, [%[c_ptr], #208]\n"
+                "udot    v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+                "str    q28, [%[c_ptr], #224]\n"
+                "udot    v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+                "str    q13, [%[c_ptr], #240]\n"
+
+                "udot    v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+                "str    q21, [%[c_ptr], #256]\n"
+                "udot    v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+                "str    q29, [%[c_ptr], #272]\n"
+                "udot    v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+                "str    q14, [%[c_ptr], #288]\n"
+
+                "udot    v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+                "str    q22, [%[c_ptr], #304]\n"
+                "udot    v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+                "str    q30, [%[c_ptr], #320]\n"
+                "udot    v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+                "str    q15, [%[c_ptr], #336]\n"
+
+                "b    3f\n"
+
+                // Detached final iteration (odd K)
+                "2:\n"
+                "udot    v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                "ldr    %q[b2], [%[b_ptr], #32]\n"
+                "udot    v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "udot   v9.4s , %[b0].16b, %[a0].4b[1]\n"
+                "str    q8, [%[c_ptr], #0]\n"
+                "udot    v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                "str    q16, [%[c_ptr], #16]\n"
+                "udot    v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "add    %[b_ptr], %[b_ptr], #48\n"
+                "add    %[a_ptr], %[a_ptr], #32\n"
+                "str    q24, [%[c_ptr], #32]\n"
+                "udot    v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                "str    q9, [%[c_ptr], #48]\n"
+
+                "udot    v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "str    q17, [%[c_ptr], #64]\n"
+                "udot    v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "str    q25, [%[c_ptr], #80]\n"
+                "udot    v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "str    q10, [%[c_ptr], #96]\n"
+
+                "udot    v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "str    q18, [%[c_ptr], #112]\n"
+                "udot    v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "str    q26, [%[c_ptr], #128]\n"
+                "udot    v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "str    q11, [%[c_ptr], #144]\n"
+
+                "udot     v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "str    q19, [%[c_ptr], #160]\n"
+                "udot    v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "str    q27, [%[c_ptr], #176]\n"
+                "udot    v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "str    q12, [%[c_ptr], #192]\n"
+
+                "udot   v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                "str    q20, [%[c_ptr], #208]\n"
+                "udot    v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "str    q28, [%[c_ptr], #224]\n"
+                "udot    v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                "str    q13, [%[c_ptr], #240]\n"
+
+                "udot    v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "str    q21, [%[c_ptr], #256]\n"
+                "udot    v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "str    q29, [%[c_ptr], #272]\n"
+                "udot    v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                "str    q14, [%[c_ptr], #288]\n"
+
+                "udot    v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "str    q22, [%[c_ptr], #304]\n"
+                "udot    v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "str    q30, [%[c_ptr], #320]\n"
+                "udot    v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "str    q15, [%[c_ptr], #336]\n"
+
+                // Common tail
+                "3:\n"
+                "str    q23, [%[c_ptr], #352]\n"
+                "str    q31, [%[c_ptr], #368]\n"
+                "add    %[c_ptr], %[c_ptr], #384\n"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+                ".purgem udot\n"
+#endif
+                :
+                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+                [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+                : [oddk] "r"(oddk)
+                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+                "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif
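For readers trying to follow the interleaved loads and the store pattern in the common tail, here is an editorial scalar reference for a single 8x12 output tile. It assumes the panel layout implied by the strategy header (A: groups of 8 rows in k-blocks of 4; B: groups of 12 columns in k-blocks of 4; K already padded to a multiple of k_unroll) and a row-major tile in Cpanel; it is not part of the patch.

#include <cstdint>

static void reference_tile_u8_12x8(const uint8_t *Apanel, const uint8_t *Bpanel,
                                   uint32_t *Cpanel, int K)
{
    for (int row = 0; row < 8; row++) {
        for (int col = 0; col < 12; col++) {
            uint32_t acc = 0;
            for (int k = 0; k < K; k++) {
                // Within each k-block of 4, A holds 8 rows x 4 bytes and
                // B holds 12 columns x 4 bytes.
                const uint8_t a = Apanel[(k / 4) * 32 + row * 4 + (k % 4)];
                const uint8_t b = Bpanel[(k / 4) * 48 + col * 4 + (k % 4)];
                acc += static_cast<uint32_t>(a) * b;
            }
            Cpanel[row * 12 + col] = acc;
        }
    }
}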
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
new file mode 100644
index 0000000..5aa5291
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+namespace arm_gemm
+{
+// Kernel definition
+void a64_gemm_u8_4x4(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K);
+
+class gemm_u8_4x4
+{
+public:
+    typedef uint8_t  operand_type;
+    typedef uint32_t result_type;
+
+    typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+
+    /* Describes the data layout for A input */
+    static const int  A_interleave = 4;
+    static const int  A_block      = 16;
+    static const bool A_transpose  = false;
+
+    /* Same for B input */
+    static const int  B_interleave = 4;
+    static const int  B_block      = 16;
+    static const bool B_transpose  = true;
+
+    /* Kernel blocking parameters */
+    static const int out_width  = 4;
+    static const int out_height = 4;
+    static const int k_unroll   = 16;
+
+    kern_type kernel = nullptr;
+
+    gemm_u8_4x4(const CPUInfo *ci)
+    {
+        kernel = a64_gemm_u8_4x4;
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp
new file mode 100644
index 0000000..0a881ff
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm
+{
+void a64_gemm_u8_4x4(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K)
+{
+    const uint8_t *a_ptr = Apanel;
+    uint32_t      *c_ptr = Cpanel;
+    K /= 16;
+
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const uint8_t *a_ptr0 = a_ptr;
+        const uint8_t *b_ptr  = Bpanel;
+
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            a_ptr = a_ptr0;
+
+            int k = K - 1;
+
+            register uint8x16_t b0 asm("v4");
+            register uint8x16_t b1 asm("v5");
+            register uint8x16_t b2 asm("v6");
+            register uint8x16_t b3 asm("v7");
+
+            __asm __volatile(
+                "movi    v16.4s, #0x0\n"
+                "ldr    q0, [%[a_ptr]]\n"
+                "movi    v17.4s, #0x0\n"
+                "ldr    %q[b0], [%[b_ptr]]\n"
+                "movi    v18.4s, #0x0\n"
+                "ldr    %q[b1], [%[b_ptr], #16]\n"
+                "movi    v19.4s, #0x0\n"
+                "ldr    %q[b2], [%[b_ptr], #32]\n"
+                "movi    v20.4s, #0x0\n"
+                "ldr    %q[b3], [%[b_ptr], #48]\n"
+                "movi    v21.4s, #0x0\n"
+                "ldr    q1, [%[a_ptr], #16]\n"
+                "movi    v22.4s, #0x0\n"
+                "ldr    q2, [%[a_ptr], #32]\n"
+                "movi    v23.4s, #0x0\n"
+                "ldr    q3, [%[a_ptr], #48]\n"
+                "movi    v24.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi    v25.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi    v26.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi    v27.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #128]")
+                "movi    v28.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi    v29.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi    v30.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi    v31.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]")
+
+                "umull    v12.8h, v0.8b, %[b0].8b\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+                "umull    v13.8h, v0.8b, %[b1].8b\n"
+                "umull    v14.8h, v0.8b, %[b2].8b\n"
+                "add    %[b_ptr], %[b_ptr], #64\n"
+                "umull    v15.8h, v0.8b, %[b3].8b\n"
+
+                // Skip loop if we are doing zero iterations of it.
+                "cbz    %w[k], 2f\n"
+
+                "1:\n"
+                "uadalp    v16.4s, v12.8h\n"
+                "umull2    v12.8h, v0.16b, %[b0].16b\n"
+                "uadalp    v17.4s, v13.8h\n"
+                "umull2    v13.8h, v0.16b, %[b1].16b\n"
+                "uadalp    v18.4s, v14.8h\n"
+                "umull2    v14.8h, v0.16b, %[b2].16b\n"
+                "uadalp    v19.4s, v15.8h\n"
+                "umull2    v15.8h, v0.16b, %[b3].16b\n"
+                "ldr     q0, [%[a_ptr]]\n"
+
+                "uadalp    v16.4s, v12.8h\n"
+                "umull    v12.8h, v1.8b, %[b0].8b\n"
+                "uadalp    v17.4s, v13.8h\n"
+                "umull    v13.8h, v1.8b, %[b1].8b\n"
+                "subs    %w[k], %w[k], #1\n"
+                "uadalp    v18.4s, v14.8h\n"
+                "umull    v14.8h, v1.8b, %[b2].8b\n"
+                "uadalp    v19.4s, v15.8h\n"
+                "umull    v15.8h, v1.8b, %[b3].8b\n"
+
+                "uadalp    v20.4s, v12.8h\n"
+                "umull2    v12.8h, v1.16b, %[b0].16b\n"
+                "uadalp    v21.4s, v13.8h\n"
+                "umull2    v13.8h, v1.16b, %[b1].16b\n" ASM_PREFETCH("[%[a_ptr], #256]")
+                "uadalp    v22.4s, v14.8h\n"
+                "umull2    v14.8h, v1.16b, %[b2].16b\n"
+                "uadalp    v23.4s, v15.8h\n"
+                "umull2    v15.8h, v1.16b, %[b3].16b\n"
+                "ldr     q1, [%[a_ptr], #16]\n"
+
+                "uadalp    v20.4s, v12.8h\n"
+                "umull    v12.8h, v2.8b, %[b0].8b\n"
+                "uadalp    v21.4s, v13.8h\n"
+                "umull    v13.8h, v2.8b, %[b1].8b\n" ASM_PREFETCH("[%[b_ptr], #256]")
+                "uadalp    v22.4s, v14.8h\n"
+                "umull    v14.8h, v2.8b, %[b2].8b\n"
+                "uadalp    v23.4s, v15.8h\n"
+                "umull    v15.8h, v2.8b, %[b3].8b\n"
+
+                "uadalp    v24.4s, v12.8h\n"
+                "umull2    v12.8h, v2.16b, %[b0].16b\n"
+                "uadalp    v25.4s, v13.8h\n"
+                "umull2    v13.8h, v2.16b, %[b1].16b\n"
+                "uadalp    v26.4s, v14.8h\n"
+                "umull2    v14.8h, v2.16b, %[b2].16b\n"
+                "uadalp    v27.4s, v15.8h\n"
+                "umull2    v15.8h, v2.16b, %[b3].16b\n"
+                "ldr    q2, [%[a_ptr], #32]\n"
+
+                "uadalp    v24.4s, v12.8h\n"
+                "umull    v12.8h, v3.8b, %[b0].8b\n"
+                "uadalp    v25.4s, v13.8h\n"
+                "umull    v13.8h, v3.8b, %[b1].8b\n"
+                "uadalp    v26.4s, v14.8h\n"
+                "umull    v14.8h, v3.8b, %[b2].8b\n"
+                "uadalp    v27.4s, v15.8h\n"
+                "umull    v15.8h, v3.8b, %[b3].8b\n"
+
+                "uadalp    v28.4s, v12.8h\n"
+                "umull2    v12.8h, v3.16b, %[b0].16b\n"
+                "ldr    %q[b0], [%[b_ptr]]\n"
+                "uadalp    v29.4s, v13.8h\n"
+                "umull2    v13.8h, v3.16b, %[b1].16b\n"
+                "ldr    %q[b1], [%[b_ptr], #16]\n"
+                "uadalp    v30.4s, v14.8h\n"
+                "umull2    v14.8h, v3.16b, %[b2].16b\n"
+                "ldr    %q[b2], [%[b_ptr], #32]\n"
+                "uadalp    v31.4s, v15.8h\n"
+                "umull2    v15.8h, v3.16b, %[b3].16b\n"
+                "ldr    %q[b3], [%[b_ptr], #48]\n"
+
+                "uadalp    v28.4s, v12.8h\n"
+                "umull    v12.8h, v0.8b, %[b0].8b\n"
+                "add    %[b_ptr], %[b_ptr], #64\n"
+                "uadalp    v29.4s, v13.8h\n"
+                "umull    v13.8h, v0.8b, %[b1].8b\n"
+                "ldr    q3, [%[a_ptr], #48]\n"
+                "uadalp    v30.4s, v14.8h\n"
+                "umull    v14.8h, v0.8b, %[b2].8b\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+                "uadalp    v31.4s, v15.8h\n"
+                "umull    v15.8h, v0.8b, %[b3].8b\n"
+                "bne    1b\n"
+
+                // Tail: finish accumulating the final k-block (also the target when the loop is skipped)
+                "2:\n"
+                "uadalp    v16.4s, v12.8h\n"
+                "umull2    v12.8h, v0.16b, %[b0].16b\n"
+                "uadalp    v17.4s, v13.8h\n"
+                "umull2    v13.8h, v0.16b, %[b1].16b\n"
+                "uadalp    v18.4s, v14.8h\n"
+                "umull2    v14.8h, v0.16b, %[b2].16b\n"
+                "uadalp    v19.4s, v15.8h\n"
+                "umull2    v15.8h, v0.16b, %[b3].16b\n"
+
+                "uadalp    v16.4s, v12.8h\n"
+                "umull    v12.8h, v1.8b, %[b0].8b\n"
+                "uadalp    v17.4s, v13.8h\n"
+                "umull    v13.8h, v1.8b, %[b1].8b\n"
+                "uadalp    v18.4s, v14.8h\n"
+                "umull    v14.8h, v1.8b, %[b2].8b\n"
+                "uadalp    v19.4s, v15.8h\n"
+                "umull    v15.8h, v1.8b, %[b3].8b\n"
+
+                "uadalp    v20.4s, v12.8h\n"
+                "umull2    v12.8h, v1.16b, %[b0].16b\n"
+                "uadalp    v21.4s, v13.8h\n"
+                "umull2    v13.8h, v1.16b, %[b1].16b\n"
+                "uadalp    v22.4s, v14.8h\n"
+                "umull2    v14.8h, v1.16b, %[b2].16b\n"
+                "uadalp    v23.4s, v15.8h\n"
+                "umull2    v15.8h, v1.16b, %[b3].16b\n"
+
+                "uadalp    v20.4s, v12.8h\n"
+                "umull    v12.8h, v2.8b, %[b0].8b\n"
+                "uadalp    v21.4s, v13.8h\n"
+                "umull    v13.8h, v2.8b, %[b1].8b\n"
+                "uadalp    v22.4s, v14.8h\n"
+                "umull    v14.8h, v2.8b, %[b2].8b\n"
+                "uadalp    v23.4s, v15.8h\n"
+                "umull    v15.8h, v2.8b, %[b3].8b\n"
+
+                "uadalp    v24.4s, v12.8h\n"
+                "umull2    v12.8h, v2.16b, %[b0].16b\n"
+                "uadalp    v25.4s, v13.8h\n"
+                "umull2    v13.8h, v2.16b, %[b1].16b\n"
+                "uadalp    v26.4s, v14.8h\n"
+                "umull2    v14.8h, v2.16b, %[b2].16b\n"
+                "uadalp    v27.4s, v15.8h\n"
+                "umull2    v15.8h, v2.16b, %[b3].16b\n"
+
+                "uadalp    v24.4s, v12.8h\n"
+                "umull    v12.8h, v3.8b, %[b0].8b\n"
+                "uadalp    v25.4s, v13.8h\n"
+                "umull    v13.8h, v3.8b, %[b1].8b\n"
+                "uadalp    v26.4s, v14.8h\n"
+                "umull    v14.8h, v3.8b, %[b2].8b\n"
+                "uadalp    v27.4s, v15.8h\n"
+                "umull    v15.8h, v3.8b, %[b3].8b\n"
+
+                "uadalp    v28.4s, v12.8h\n"
+                "umull2    v12.8h, v3.16b, %[b0].16b\n"
+                "uadalp    v29.4s, v13.8h\n"
+                "umull2    v13.8h, v3.16b, %[b1].16b\n"
+                "uadalp    v30.4s, v14.8h\n"
+                "umull2    v14.8h, v3.16b, %[b2].16b\n"
+                "uadalp    v31.4s, v15.8h\n"
+                "umull2    v15.8h, v3.16b, %[b3].16b\n"
+
+                "uadalp    v28.4s, v12.8h\n"
+                "uadalp    v29.4s, v13.8h\n"
+                "uadalp    v30.4s, v14.8h\n"
+                "uadalp    v31.4s, v15.8h\n"
+
+                "addp    v16.4s, v16.4s, v17.4s\n"
+                "addp    v17.4s, v18.4s, v19.4s\n"
+                "addp    v18.4s, v20.4s, v21.4s\n"
+                "addp    v19.4s, v22.4s, v23.4s\n"
+                "addp    v20.4s, v24.4s, v25.4s\n"
+                "addp    v21.4s, v26.4s, v27.4s\n"
+                "addp    v22.4s, v28.4s, v29.4s\n"
+                "addp    v23.4s, v30.4s, v31.4s\n"
+
+                "addp    v16.4s, v16.4s, v17.4s\n"
+                "addp    v17.4s, v18.4s, v19.4s\n"
+                "addp    v18.4s, v20.4s, v21.4s\n"
+                "addp    v19.4s, v22.4s, v23.4s\n"
+
+                "str    q16, [%[c_ptr]]\n"
+                "str    q17, [%[c_ptr], #16]\n"
+                "str    q18, [%[c_ptr], #32]\n"
+                "str    q19, [%[c_ptr], #48]\n"
+                "add    %[c_ptr], %[c_ptr], #64\n"
+
+                :
+                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [b3] "+w"(b3),
+                [k] "+r"(k)
+                :
+                : "x20", "x21", "v0", "v1", "v2", "v3", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+                "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
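
For reference, the umull/uadalp/addp sequence above computes unsigned 8-bit dot products: each product is widened to 16 bits (umull/umull2), pairs of products are accumulated into 32-bit lanes (uadalp), and the lanes are folded into one sum per output element (addp). A minimal scalar sketch of that reduction, not part of the patch; the helper name and signature are illustrative only:

#include <cstdint>

// One output element of the u8 kernel above: each 8-bit product fits in 16 bits,
// and the running sum is kept in 32 bits, matching the vector code's widening steps.
static inline uint32_t dot_u8_reference(const uint8_t *a, const uint8_t *b, int K)
{
    uint32_t acc = 0;
    for(int k = 0; k < K; k++)
    {
        acc += static_cast<uint16_t>(a[k]) * static_cast<uint16_t>(b[k]);
    }
    return acc;
}
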
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
new file mode 100644
index 0000000..77ec59a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
+
+#include "arm_gemm.hpp"
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_hgemm_asimd_24x8(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+void a64_hgemm_asimd_24x8_a55r1(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+
+// 24x8 HGEMM "strategy" class.  Describes the kernel properties.
+//
+// The generic "gemm_opt" function will instantiate one of these (allowing
+// the constructor to pick a kernel implementation).
+class hgemm_24x8
+{
+public:
+    typedef __fp16 operand_type;
+    typedef __fp16 result_type;
+
+    typedef void (*kern_type)(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+
+    static const int  A_block      = 1;
+    static const int  A_interleave = 8;
+    static const bool A_transpose  = false;
+
+    static const int  B_block      = 1;
+    static const int  B_interleave = 24;
+    static const bool B_transpose  = true;
+
+    static const int out_width  = 24;
+    static const int out_height = 8;
+    static const int k_unroll   = 1;
+
+    // Default to the generic kernel
+    kern_type kernel = a64_hgemm_asimd_24x8;
+
+    hgemm_24x8(const CPUInfo *ci)
+    {
+        if(ci->get_cpu_model() == CPUModel::A55r1)
+        {
+            kernel = a64_hgemm_asimd_24x8_a55r1;
+        }
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__ && __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
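
The strategy classes in this patch only publish packing constants and a kernel function pointer, with kernel selection done in the constructor. A hedged sketch of how a templated driver might consume one of them (the driver below is illustrative only and is not the interface added by this patch; it assumes arm_gemm.hpp declares CPUInfo, as the header above does):

#include "arm_gemm.hpp"

template <typename strategy>
void run_packed_block(const CPUInfo *ci,
                      const typename strategy::operand_type *a_panel,
                      const typename strategy::operand_type *b_panel,
                      typename strategy::result_type *c_panel,
                      int ablocks, int bblocks, int K)
{
    // The constructor inspects the CPU model and picks the best kernel variant.
    strategy strat(ci);

    // A_interleave/B_interleave and out_width/out_height describe how the panels
    // must have been packed; the call itself goes through the selected pointer.
    strat.kernel(a_panel, b_panel, c_panel, ablocks, bblocks, K);
}

For example, run_packed_block<arm_gemm::hgemm_24x8>(ci, a, b, c, 1, bblocks, K) would dispatch to the A55r1 variant on that core and to the generic ASIMD kernel elsewhere.
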
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp
new file mode 100644
index 0000000..d59618d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp
@@ -0,0 +1,360 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 24xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 24x8), the chunks being arranged in a row-major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
+
+namespace arm_gemm
+{
+void a64_hgemm_asimd_24x8_a55r1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K)
+{
+    const __fp16 *a_ptr = Apanel;
+    __fp16       *c_ptr = Cpanel;
+
+    // Fix up for odd lengths - set a flag if K is odd, but make
+    // sure we round up the iteration count.
+    int oddk    = (K & 1);
+    int k_iters = ((K + 1) / 2) - 1;
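+    // (For example, K = 5 gives oddk = 1 and k_iters = 2: two passes of the 2x
+    // unrolled main loop cover k = 0..3, and the pipelined final iteration, taking
+    // its odd path, handles the remaining k = 4.)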
+
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const __fp16 *a_ptr0 = a_ptr;
+        const __fp16 *b_ptr  = Bpanel;
+
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            int k = k_iters;
+            a_ptr = a_ptr0;
+
+            // As A55 requires 64-bit loads anyway, just use 64 bits of the
+            // "A" operands to save on "ins" instructions.  Since A55 is
+            // in-order, two sets of "A" operands and one set of "B" is
+            // sufficient.
+            register float16x8_t a0 asm("v0");
+            register float16x8_t a1 asm("v1");
+            register float16x8_t a0a asm("v2");
+            register float16x8_t a1a asm("v3");
+            register float16x8_t b0 asm("v4");
+            register float16x8_t b1 asm("v5");
+            register float16x8_t b2 asm("v6");
+
+            __asm __volatile(
+                // Enable FP16 extensions
+                ".arch    armv8.2-a+fp16\n"
+                // Initialize result registers, load initial operands, prime prefetches.
+                "movi    v8.8h, #0x0\n"
+                "ldr    %d[a0], [%[a_ptr]]\n"
+                "movi    v9.8h, #0x0\n"
+                "ldr    %q[b0], [%[b_ptr]]\n"
+                "movi    v10.8h, #0x0\n"
+                "ldr    %d[a1], [%[a_ptr], #8]\n"
+                "movi    v11.8h, #0x0\n"
+                "ldr    %q[b1], [%[b_ptr], #16]\n"
+                "movi    v12.8h, #0x0\n"
+                "movi    v13.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi    v14.8h, #0x0\n"
+                "movi    v15.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi    v16.8h, #0x0\n"
+                "movi    v17.8h, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi    v18.8h, #0x0\n"
+                "movi    v19.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi    v20.8h, #0x0\n"
+                "movi    v21.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi    v22.8h, #0x0\n"
+                "movi    v23.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi    v24.8h, #0x0\n"
+                "movi    v25.8h, #0x0\n"
+                "movi    v26.8h, #0x0\n"
+                "movi    v27.8h, #0x0\n"
+                "movi    v28.8h, #0x0\n"
+                "movi    v29.8h, #0x0\n"
+                "movi    v30.8h, #0x0\n"
+                "movi    v31.8h, #0x0\n"
+
+                // The loop is offset by these two instructions which must
+                // always be executed.
+                "fmla     v8.8h , %[b0].8h, %[a0].h[0]\n"
+                "ldr    %d[b2], [%[b_ptr], #32]\n"
+
+                // Skip loop if we are doing zero iterations of it.
+                "cbz    %w[k], 4f\n"
+
+                "1:\n"
+                "fmla      v9.8h , %[b0].8h, %[a0].h[1]\n"
+                "ldr    x20, [%[b_ptr], #40]\n"
+                "fmla    v10.8h, %[b0].8h, %[a0].h[2]\n"
+                "subs    %w[k], %w[k], #1\n"
+                "fmla    v11.8h, %[b0].8h, %[a0].h[3]\n"
+                "ldr    %d[a0a], [%[a_ptr], #16]\n"
+
+                "fmla     v12.8h, %[b0].8h, %[a1].h[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v13.8h, %[b0].8h, %[a1].h[1]\n"
+                "fmla    v14.8h, %[b0].8h, %[a1].h[2]\n"
+                "fmla    v15.8h, %[b0].8h, %[a1].h[3]\n"
+                "ldr    %d[a1a], [%[a_ptr], #24]\n"
+
+                "fmla    v16.8h, %[b1].8h, %[a0].h[0]\n"
+                "fmla    v17.8h, %[b1].8h, %[a0].h[1]\n"
+                "fmla    v18.8h, %[b1].8h, %[a0].h[2]\n"
+                "fmla    v19.8h, %[b1].8h, %[a0].h[3]\n"
+                "ldr    %d[b0], [%[b_ptr], #48]\n"
+
+                "fmla    v20.8h, %[b1].8h, %[a1].h[0]\n"
+                "fmla    v21.8h, %[b1].8h, %[a1].h[1]\n"
+                "ldr    x20, [%[b_ptr], #56]\n"
+                "fmla    v22.8h, %[b1].8h, %[a1].h[2]\n"
+                "fmla    v23.8h, %[b1].8h, %[a1].h[3]\n"
+                "ldr    %d[b1], [%[b_ptr], #64]\n"
+
+                "fmla    v24.8h, %[b2].8h, %[a0].h[0]\n"
+                "ins    %[b0].d[1], x20\n"
+                "fmla    v25.8h, %[b2].8h, %[a0].h[1]\n"
+                "ldr    x20, [%[b_ptr], #72]\n"
+                "fmla    v26.8h, %[b2].8h, %[a0].h[2]\n"
+                "fmla    v27.8h, %[b2].8h, %[a0].h[3]\n" ASM_PREFETCH("[%[a_ptr], #128]")
+
+                "fmla    v28.8h, %[b2].8h, %[a1].h[0]\n"
+                "fmla    v29.8h, %[b2].8h, %[a1].h[1]\n" ASM_PREFETCH("[%[b_ptr], #384]")
+                "fmla    v30.8h, %[b2].8h, %[a1].h[2]\n"
+                "fmla    v31.8h, %[b2].8h, %[a1].h[3]\n"
+                "ldr    %d[b2], [%[b_ptr], #80]\n"
+
+                // Unroll 1
+                "fmla     v8.8h , %[b0].8h, %[a0a].h[0]\n"
+                "ins    %[b1].d[1], x20\n"
+                "fmla    v9.8h , %[b0].8h, %[a0a].h[1]\n"
+                "ldr    x20, [%[b_ptr], #88]\n"
+                "fmla    v10.8h, %[b0].8h, %[a0a].h[2]\n"
+                "fmla    v11.8h, %[b0].8h, %[a0a].h[3]\n"
+                "ldr    %d[a0], [%[a_ptr], #32]\n"
+
+                "fmla     v12.8h, %[b0].8h, %[a1a].h[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v13.8h, %[b0].8h, %[a1a].h[1]\n"
+                "fmla    v14.8h, %[b0].8h, %[a1a].h[2]\n"
+                "fmla    v15.8h, %[b0].8h, %[a1a].h[3]\n"
+                "ldr    %d[a1], [%[a_ptr], #40]\n"
+
+                "fmla    v16.8h, %[b1].8h, %[a0a].h[0]\n"
+                "add    %[a_ptr], %[a_ptr], #32\n"
+                "fmla    v17.8h, %[b1].8h, %[a0a].h[1]\n"
+                "fmla    v18.8h, %[b1].8h, %[a0a].h[2]\n"
+                "fmla    v19.8h, %[b1].8h, %[a0a].h[3]\n"
+                "ldr    %d[b0], [%[b_ptr], #96]\n"
+
+                "fmla    v20.8h, %[b1].8h, %[a1a].h[0]\n"
+                "fmla    v21.8h, %[b1].8h, %[a1a].h[1]\n"
+                "ldr    x20, [%[b_ptr], #104]\n"
+                "fmla    v22.8h, %[b1].8h, %[a1a].h[2]\n"
+                "fmla    v23.8h, %[b1].8h, %[a1a].h[3]\n"
+                "ldr    %d[b1], [%[b_ptr], #112]\n"
+
+                "fmla    v24.8h, %[b2].8h, %[a0a].h[0]\n"
+                "ins    %[b0].d[1], x20\n"
+                "fmla    v25.8h, %[b2].8h, %[a0a].h[1]\n"
+                "ldr    x20, [%[b_ptr], #120]\n"
+                "fmla    v26.8h, %[b2].8h, %[a0a].h[2]\n"
+                "fmla    v27.8h, %[b2].8h, %[a0a].h[3]\n"
+
+                "fmla    v28.8h, %[b2].8h, %[a1a].h[0]\n" ASM_PREFETCH("[%[b_ptr], #448]")
+                "fmla    v29.8h, %[b2].8h, %[a1a].h[1]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "fmla    v30.8h, %[b2].8h, %[a1a].h[2]\n"
+                "ins    %[b1].d[1], x20\n"
+                "fmla    v31.8h, %[b2].8h, %[a1a].h[3]\n"
+                "ldr    %d[b2], [%[b_ptr], #32]\n"
+
+                "fmla     v8.8h , %[b0].8h, %[a0].h[0]\n"
+                "bne    1b\n"
+
+                "4:\n"
+
+                // Start final iteration - branch off to "odd" code before we load a0a
+                "fmla      v9.8h , %[b0].8h, %[a0].h[1]\n"
+                "ldr    x20, [%[b_ptr], #40]\n"
+                "fmla    v10.8h, %[b0].8h, %[a0].h[2]\n"
+                "cbnz    %w[oddk], 2f\n"
+
+                // Even K continuation
+                "fmla    v11.8h, %[b0].8h, %[a0].h[3]\n"
+                "ldr    %d[a0a], [%[a_ptr], #16]\n"
+
+                "fmla     v12.8h, %[b0].8h, %[a1].h[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v13.8h, %[b0].8h, %[a1].h[1]\n" ASM_PREFETCHW("[%[c_ptr]]")
+                "fmla    v14.8h, %[b0].8h, %[a1].h[2]\n"
+                "fmla    v15.8h, %[b0].8h, %[a1].h[3]\n"
+                "ldr    %d[a1a], [%[a_ptr], #24]\n"
+
+                "fmla    v16.8h, %[b1].8h, %[a0].h[0]\n"
+                "fmla    v17.8h, %[b1].8h, %[a0].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+                "fmla    v18.8h, %[b1].8h, %[a0].h[2]\n"
+                "fmla    v19.8h, %[b1].8h, %[a0].h[3]\n"
+                "ldr    %d[b0], [%[b_ptr], #48]\n"
+
+                "fmla    v20.8h, %[b1].8h, %[a1].h[0]\n"
+                "fmla    v21.8h, %[b1].8h, %[a1].h[1]\n"
+                "ldr    x20, [%[b_ptr], #56]\n"
+                "fmla    v22.8h, %[b1].8h, %[a1].h[2]\n"
+                "fmla    v23.8h, %[b1].8h, %[a1].h[3]\n"
+                "ldr    %d[b1], [%[b_ptr], #64]\n"
+
+                "fmla    v24.8h, %[b2].8h, %[a0].h[0]\n"
+                "ins    %[b0].d[1], x20\n"
+                "fmla    v25.8h, %[b2].8h, %[a0].h[1]\n"
+                "ldr    x20, [%[b_ptr], #72]\n"
+                "fmla    v26.8h, %[b2].8h, %[a0].h[2]\n"
+                "fmla    v27.8h, %[b2].8h, %[a0].h[3]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+
+                "fmla    v28.8h, %[b2].8h, %[a1].h[0]\n"
+                "fmla    v29.8h, %[b2].8h, %[a1].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+                "fmla    v30.8h, %[b2].8h, %[a1].h[2]\n"
+                "fmla    v31.8h, %[b2].8h, %[a1].h[3]\n"
+                "ldr    %d[b2], [%[b_ptr], #80]\n"
+
+                "fmla     v8.8h , %[b0].8h, %[a0a].h[0]\n"
+                "ins    %[b1].d[1], x20\n"
+                "fmla    v9.8h , %[b0].8h, %[a0a].h[1]\n"
+                "ldr    x20, [%[b_ptr], #88]\n"
+                "fmla    v10.8h, %[b0].8h, %[a0a].h[2]\n"
+                "fmla    v11.8h, %[b0].8h, %[a0a].h[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+
+                "fmla     v12.8h, %[b0].8h, %[a1a].h[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v13.8h, %[b0].8h, %[a1a].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+                "fmla    v14.8h, %[b0].8h, %[a1a].h[2]\n"
+                "fmla    v15.8h, %[b0].8h, %[a1a].h[3]\n"
+                "ldr    %d[a1], [%[a_ptr], #40]\n"
+
+                "fmla    v16.8h, %[b1].8h, %[a0a].h[0]\n"
+                "add    %[a_ptr], %[a_ptr], #32\n"
+                "fmla    v17.8h, %[b1].8h, %[a0a].h[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+                "fmla    v18.8h, %[b1].8h, %[a0a].h[2]\n"
+                "fmla    v19.8h, %[b1].8h, %[a0a].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+
+                "fmla    v20.8h, %[b1].8h, %[a1a].h[0]\n"
+                "fmla    v21.8h, %[b1].8h, %[a1a].h[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]")
+                "fmla    v22.8h, %[b1].8h, %[a1a].h[2]\n"
+                "fmla    v23.8h, %[b1].8h, %[a1a].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]")
+
+                "fmla    v24.8h, %[b2].8h, %[a0a].h[0]\n"
+                "fmla    v25.8h, %[b2].8h, %[a0a].h[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #640]")
+                "fmla    v26.8h, %[b2].8h, %[a0a].h[2]\n"
+                "fmla    v27.8h, %[b2].8h, %[a0a].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+
+                "fmla    v28.8h, %[b2].8h, %[a1a].h[0]\n"
+                "fmla    v29.8h, %[b2].8h, %[a1a].h[1]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "fmla    v30.8h, %[b2].8h, %[a1a].h[2]\n"
+                "fmla    v31.8h, %[b2].8h, %[a1a].h[3]\n"
+                "b    3f\n"
+
+                "2:\n"
+
+                // Odd tail
+                "fmla    v11.8h, %[b0].8h, %[a0].h[3]\n" ASM_PREFETCHW("[%[c_ptr]]")
+
+                "fmla     v12.8h, %[b0].8h, %[a1].h[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v13.8h, %[b0].8h, %[a1].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+                "fmla    v14.8h, %[b0].8h, %[a1].h[2]\n"
+                "add    %[a_ptr], %[a_ptr], #16\n"
+                "fmla    v15.8h, %[b0].8h, %[a1].h[3]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+
+                "fmla    v16.8h, %[b1].8h, %[a0].h[0]\n"
+                "add    %[b_ptr], %[b_ptr], #48\n"
+                "fmla    v17.8h, %[b1].8h, %[a0].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+                "fmla    v18.8h, %[b1].8h, %[a0].h[2]\n"
+                "fmla    v19.8h, %[b1].8h, %[a0].h[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+
+                "fmla    v20.8h, %[b1].8h, %[a1].h[0]\n"
+                "fmla    v21.8h, %[b1].8h, %[a1].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+                "fmla    v22.8h, %[b1].8h, %[a1].h[2]\n"
+                "fmla    v23.8h, %[b1].8h, %[a1].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+
+                "fmla    v24.8h, %[b2].8h, %[a0].h[0]\n"
+                "fmla    v25.8h, %[b2].8h, %[a0].h[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+                "fmla    v26.8h, %[b2].8h, %[a0].h[2]\n"
+                "fmla    v27.8h, %[b2].8h, %[a0].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+
+                "fmla    v28.8h, %[b2].8h, %[a1].h[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]")
+                "fmla    v29.8h, %[b2].8h, %[a1].h[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]")
+                "fmla    v30.8h, %[b2].8h, %[a1].h[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #640]")
+                "fmla    v31.8h, %[b2].8h, %[a1].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+
+                // Common tail
+                // A55 won't dual issue these stores with anything else, so
+                // simplest to do them all in this common code.
+                "3:\n"
+                "str    q8,  [%[c_ptr]]\n"
+                "str    q16, [%[c_ptr], #16]\n"
+                "str    q24, [%[c_ptr], #32]\n"
+                "str    q9,  [%[c_ptr], #48]\n"
+                "str    q17, [%[c_ptr], #64]\n"
+                "str    q25, [%[c_ptr], #80]\n"
+                "str    q10, [%[c_ptr], #96]\n"
+                "str    q18, [%[c_ptr], #112]\n"
+                "str    q26, [%[c_ptr], #128]\n"
+                "str    q11, [%[c_ptr], #144]\n"
+                "str    q19, [%[c_ptr], #160]\n"
+                "str    q27, [%[c_ptr], #176]\n"
+                "str    q12, [%[c_ptr], #192]\n"
+                "str    q20, [%[c_ptr], #208]\n"
+                "str    q28, [%[c_ptr], #224]\n"
+                "str    q13, [%[c_ptr], #240]\n"
+                "str    q21, [%[c_ptr], #256]\n"
+                "str    q29, [%[c_ptr], #272]\n"
+                "str    q14, [%[c_ptr], #288]\n"
+                "str    q22, [%[c_ptr], #304]\n"
+                "str    q30, [%[c_ptr], #320]\n"
+                "str    q15, [%[c_ptr], #336]\n"
+                "str    q23, [%[c_ptr], #352]\n"
+                "str    q31, [%[c_ptr], #368]\n"
+                "5:\n"
+                "add    %[c_ptr], %[c_ptr], #384\n"
+                :
+                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+                [a0] "=w"(a0), [a0a] "=w"(a0a), [a1] "=w"(a1), [a1a] "=w"(a1a),
+                [b0] "=w"(b0), [b1] "=w"(b1), [b2] "=w"(b2), [k] "+r"(k)
+                : [oddk] "r"(oddk)
+                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+                "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__ && __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
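
The ldr %d[...] / ldr x20 / ins triples in the kernel above are the recurring trick for the in-order A55: a 128-bit operand is brought in as two 64-bit loads, one straight into the low half of the vector register and one via a general-purpose register that is then inserted into the top half, so the loads pair better with the fmlas on that core. A standalone sketch of the idiom under the same build assumptions as the file above (AArch64 with FP16 enabled); the function name is illustrative:

#include <arm_neon.h>
#include <cstdint>

static inline float16x8_t load_q_as_two_d(const __fp16 *p)
{
    float16x8_t v;
    uint64_t    hi;

    __asm __volatile(
        "ldr    %d[v], [%[p]]\n"     // low 64 bits directly into the vector register
        "ldr    %[hi], [%[p], #8]\n" // high 64 bits via a general-purpose register
        "ins    %[v].d[1], %[hi]\n"  // merge into the top half
        : [v] "=w"(v), [hi] "=r"(hi)
        : [p] "r"(p)
        : "memory");

    return v;
}
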
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
new file mode 100644
index 0000000..468d603
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 24xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 24x8), the chunks being arranged in a row-major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
+
+namespace arm_gemm
+{
+void a64_hgemm_asimd_24x8(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K)
+{
+    const __fp16 *a_ptr = Apanel;
+    __fp16       *c_ptr = Cpanel;
+
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const __fp16 *a_ptr0 = a_ptr;
+        const __fp16 *b_ptr  = Bpanel;
+
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            a_ptr = a_ptr0;
+            // Fix up for odd lengths - set a flag if K is odd, but make
+            // sure we round up the iteration count.
+            int oddk = (K & 1);
+            int k    = ((K + 1) / 2) - 1;
+
+            register float16x8_t a0 asm("v0");
+            register float16x8_t a0a asm("v1");
+            register float16x8_t b0 asm("v2");
+            register float16x8_t b1 asm("v3");
+            register float16x8_t b2 asm("v4");
+            register float16x8_t b0a asm("v5");
+            register float16x8_t b1a asm("v6");
+            register float16x8_t b2a asm("v7");
+
+            __asm __volatile(
+                ".arch    armv8.2-a+fp16\n"
+                // Initialize result registers, load initial operands, prime prefetches.
+                "movi    v8.8h, #0x0\n"
+                "ldr    %q[a0], [%[a_ptr]]\n"
+                "movi    v9.8h, #0x0\n"
+                "ldr    %q[b0], [%[b_ptr]]\n"
+                "movi    v10.8h, #0x0\n"
+                "ldr    %q[b1], [%[b_ptr], #16]\n"
+                "movi    v11.8h, #0x0\n"
+                "ldr    %q[b2], [%[b_ptr], #32]\n"
+                "movi    v12.8h, #0x0\n"
+                "ldr    %q[b0a], [%[b_ptr], #48]\n"
+                "movi    v13.8h, #0x0\n"
+                "ldr    %q[b1a], [%[b_ptr], #64]\n"
+                "movi    v14.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi    v15.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi    v16.8h, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi    v17.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi    v18.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi    v19.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi    v20.8h, #0x0\n"
+                "movi    v21.8h, #0x0\n"
+                "movi    v22.8h, #0x0\n"
+                "movi    v23.8h, #0x0\n"
+                "movi    v24.8h, #0x0\n"
+                "movi    v25.8h, #0x0\n"
+                "movi    v26.8h, #0x0\n"
+                "movi    v27.8h, #0x0\n"
+                "movi    v28.8h, #0x0\n"
+                "movi    v29.8h, #0x0\n"
+                "movi    v30.8h, #0x0\n"
+                "movi    v31.8h, #0x0\n"
+
+                // Skip loop if we are doing zero iterations of it.
+                "cbz    %w[k], 4f\n"
+
+                "1:\n"
+                "fmla     v8.8h , %[b0].8h, %[a0].h[0]\n"
+                "fmla      v9.8h , %[b0].8h, %[a0].h[1]\n"
+                "ldr    %q[a0a], [%[a_ptr], #16]\n"
+                "fmla    v10.8h, %[b0].8h, %[a0].h[2]\n"
+                "fmla    v11.8h, %[b0].8h, %[a0].h[3]\n"
+                "ldr    %q[b2a], [%[b_ptr], #80]\n"
+                "fmla     v12.8h, %[b0].8h, %[a0].h[4]\n"
+                "fmla    v13.8h, %[b0].8h, %[a0].h[5]\n"
+                "fmla    v14.8h, %[b0].8h, %[a0].h[6]\n"
+                "fmla    v15.8h, %[b0].8h, %[a0].h[7]\n"
+                "ldr    %q[b0], [%[b_ptr], #96]\n"
+
+                "fmla    v16.8h, %[b1].8h, %[a0].h[0]\n"
+                "fmla    v17.8h, %[b1].8h, %[a0].h[1]\n" ASM_PREFETCH("[%[a_ptr], #128]")
+                "fmla    v18.8h, %[b1].8h, %[a0].h[2]\n"
+                "fmla    v19.8h, %[b1].8h, %[a0].h[3]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "fmla    v20.8h, %[b1].8h, %[a0].h[4]\n"
+                "fmla    v21.8h, %[b1].8h, %[a0].h[5]\n"
+                "fmla    v22.8h, %[b1].8h, %[a0].h[6]\n"
+                "fmla    v23.8h, %[b1].8h, %[a0].h[7]\n"
+                "ldr    %q[b1], [%[b_ptr], #16]\n"
+
+                "fmla    v24.8h, %[b2].8h, %[a0].h[0]\n"
+                "fmla    v25.8h, %[b2].8h, %[a0].h[1]\n" ASM_PREFETCH("[%[b_ptr], #288]")
+                "fmla    v26.8h, %[b2].8h, %[a0].h[2]\n"
+                "fmla    v27.8h, %[b2].8h, %[a0].h[3]\n"
+                "fmla    v28.8h, %[b2].8h, %[a0].h[4]\n"
+                "fmla    v29.8h, %[b2].8h, %[a0].h[5]\n"
+                "fmla    v30.8h, %[b2].8h, %[a0].h[6]\n"
+                "fmla    v31.8h, %[b2].8h, %[a0].h[7]\n"
+                "ldr    %q[a0], [%[a_ptr], #32]\n"
+
+                "fmla     v8.8h , %[b0a].8h, %[a0a].h[0]\n"
+                "fmla    v9.8h , %[b0a].8h, %[a0a].h[1]\n"
+                "ldr    %q[b2], [%[b_ptr], #32]\n"
+                "fmla    v10.8h, %[b0a].8h, %[a0a].h[2]\n"
+                "fmla    v11.8h, %[b0a].8h, %[a0a].h[3]\n"
+                "fmla     v12.8h, %[b0a].8h, %[a0a].h[4]\n"
+                "fmla    v13.8h, %[b0a].8h, %[a0a].h[5]\n"
+                "fmla    v14.8h, %[b0a].8h, %[a0a].h[6]\n"
+                "fmla    v15.8h, %[b0a].8h, %[a0a].h[7]\n"
+                "ldr    %q[b0a], [%[b_ptr], #48]\n"
+
+                "fmla    v16.8h, %[b1a].8h, %[a0a].h[0]\n"
+                "fmla    v17.8h, %[b1a].8h, %[a0a].h[1]\n" ASM_PREFETCH("[%[b_ptr], #352]")
+                "fmla    v18.8h, %[b1a].8h, %[a0a].h[2]\n"
+                "fmla    v19.8h, %[b1a].8h, %[a0a].h[3]\n"
+                "fmla    v20.8h, %[b1a].8h, %[a0a].h[4]\n"
+                "fmla    v21.8h, %[b1a].8h, %[a0a].h[5]\n"
+                "fmla    v22.8h, %[b1a].8h, %[a0a].h[6]\n"
+                "fmla    v23.8h, %[b1a].8h, %[a0a].h[7]\n"
+                "ldr    %q[b1a], [%[b_ptr], #64]\n"
+
+                "fmla    v24.8h, %[b2a].8h, %[a0a].h[0]\n"
+                "fmla    v25.8h, %[b2a].8h, %[a0a].h[1]\n"
+                "add    %[a_ptr], %[a_ptr], #32\n"
+                "fmla    v26.8h, %[b2a].8h, %[a0a].h[2]\n"
+                "fmla    v27.8h, %[b2a].8h, %[a0a].h[3]\n"
+                "fmla    v28.8h, %[b2a].8h, %[a0a].h[4]\n"
+                "fmla    v29.8h, %[b2a].8h, %[a0a].h[5]\n"
+                "subs    %w[k], %w[k], #1\n"
+                "fmla    v30.8h, %[b2a].8h, %[a0a].h[6]\n"
+                "fmla    v31.8h, %[b2a].8h, %[a0a].h[7]\n"
+
+                "bne    1b\n"
+                "4:\n"
+
+                // Jump to odd tail if necessary.
+                "cbnz    %w[oddk], 2f\n"
+
+                // Even tail.
+                "fmla     v8.8h , %[b0].8h, %[a0].h[0]\n"
+                "fmla   v9.8h , %[b0].8h, %[a0].h[1]\n"
+                "ldr    %q[a0a], [%[a_ptr], #16]\n"
+                "fmla    v10.8h, %[b0].8h, %[a0].h[2]\n"
+                "fmla    v11.8h, %[b0].8h, %[a0].h[3]\n"
+                "ldr    %q[b2a], [%[b_ptr], #80]\n"
+                "fmla     v12.8h, %[b0].8h, %[a0].h[4]\n"
+                "fmla   v13.8h, %[b0].8h, %[a0].h[5]\n"
+                "fmla    v14.8h, %[b0].8h, %[a0].h[6]\n"
+                "fmla    v15.8h, %[b0].8h, %[a0].h[7]\n"
+
+                "fmla    v16.8h, %[b1].8h, %[a0].h[0]\n"
+                "fmla    v17.8h, %[b1].8h, %[a0].h[1]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "fmla    v18.8h, %[b1].8h, %[a0].h[2]\n"
+                "fmla    v19.8h, %[b1].8h, %[a0].h[3]\n"
+                "fmla    v20.8h, %[b1].8h, %[a0].h[4]\n"
+                "fmla    v21.8h, %[b1].8h, %[a0].h[5]\n"
+                "add    %[a_ptr], %[a_ptr], #32\n"
+                "fmla    v22.8h, %[b1].8h, %[a0].h[6]\n"
+                "fmla    v23.8h, %[b1].8h, %[a0].h[7]\n"
+
+                "fmla    v24.8h, %[b2].8h, %[a0].h[0]\n"
+                "fmla    v25.8h, %[b2].8h, %[a0].h[1]\n"
+                "fmla    v26.8h, %[b2].8h, %[a0].h[2]\n"
+                "fmla    v27.8h, %[b2].8h, %[a0].h[3]\n"
+                "fmla    v28.8h, %[b2].8h, %[a0].h[4]\n"
+                "fmla    v29.8h, %[b2].8h, %[a0].h[5]\n"
+                "fmla    v30.8h, %[b2].8h, %[a0].h[6]\n"
+                "fmla    v31.8h, %[b2].8h, %[a0].h[7]\n"
+
+                "fmla     v8.8h , %[b0a].8h, %[a0a].h[0]\n"
+                "fmla    v16.8h, %[b1a].8h, %[a0a].h[0]\n"
+                "str    q8, [%[c_ptr]]\n"
+                "fmla    v24.8h, %[b2a].8h, %[a0a].h[0]\n"
+                "str    q16, [%[c_ptr], #16]\n"
+
+                "fmla      v9.8h , %[b0a].8h, %[a0a].h[1]\n"
+                "str    q24, [%[c_ptr], #32]\n"
+                "fmla    v17.8h, %[b1a].8h, %[a0a].h[1]\n"
+                "str    q9, [%[c_ptr], #48]\n"
+                "fmla    v25.8h, %[b2a].8h, %[a0a].h[1]\n"
+                "str    q17, [%[c_ptr], #64]\n"
+
+                "fmla    v10.8h, %[b0a].8h, %[a0a].h[2]\n"
+                "str    q25, [%[c_ptr], #80]\n"
+                "fmla    v18.8h, %[b1a].8h, %[a0a].h[2]\n"
+                "str    q10, [%[c_ptr], #96]\n"
+                "fmla    v26.8h, %[b2a].8h, %[a0a].h[2]\n"
+                "str    q18, [%[c_ptr], #112]\n"
+
+                "fmla    v11.8h, %[b0a].8h, %[a0a].h[3]\n"
+                "str    q26, [%[c_ptr], #128]\n"
+                "fmla    v19.8h, %[b1a].8h, %[a0a].h[3]\n"
+                "str    q11, [%[c_ptr], #144]\n"
+                "fmla    v27.8h, %[b2a].8h, %[a0a].h[3]\n"
+                "str    q19, [%[c_ptr], #160]\n"
+
+                "fmla     v12.8h, %[b0a].8h, %[a0a].h[4]\n"
+                "str    q27, [%[c_ptr], #176]\n"
+                "fmla    v20.8h, %[b1a].8h, %[a0a].h[4]\n"
+                "str    q12, [%[c_ptr], #192]\n"
+                "fmla    v28.8h, %[b2a].8h, %[a0a].h[4]\n"
+                "str    q20, [%[c_ptr], #208]\n"
+
+                "fmla      v13.8h, %[b0a].8h, %[a0a].h[5]\n"
+                "str    q28, [%[c_ptr], #224]\n"
+                "fmla    v21.8h, %[b1a].8h, %[a0a].h[5]\n"
+                "str    q13, [%[c_ptr], #240]\n"
+                "fmla    v29.8h, %[b2a].8h, %[a0a].h[5]\n"
+                "str    q21, [%[c_ptr], #256]\n"
+
+                "fmla    v14.8h, %[b0a].8h, %[a0a].h[6]\n"
+                "str    q29, [%[c_ptr], #272]\n"
+                "fmla    v22.8h, %[b1a].8h, %[a0a].h[6]\n"
+                "str    q14, [%[c_ptr], #288]\n"
+                "fmla    v30.8h, %[b2a].8h, %[a0a].h[6]\n"
+                "str    q22, [%[c_ptr], #304]\n"
+
+                "fmla    v15.8h, %[b0a].8h, %[a0a].h[7]\n"
+                "str    q30, [%[c_ptr], #320]\n"
+                "fmla    v23.8h, %[b1a].8h, %[a0a].h[7]\n"
+                "str    q15, [%[c_ptr], #336]\n"
+                "fmla    v31.8h, %[b2a].8h, %[a0a].h[7]\n"
+                "b    3f\n"
+
+                // Odd tail
+                "2:\n"
+                "fmla     v8.8h , %[b0].8h, %[a0].h[0]\n"
+                "add    %[b_ptr], %[b_ptr], #48\n"
+                "fmla    v16.8h, %[b1].8h, %[a0].h[0]\n"
+                "add    %[a_ptr], %[a_ptr], #16\n"
+                "str    q8, [%[c_ptr]]\n"
+                "fmla    v24.8h, %[b2].8h, %[a0].h[0]\n"
+                "str    q16, [%[c_ptr], #16]\n"
+
+                "fmla      v9.8h , %[b0].8h, %[a0].h[1]\n"
+                "str    q24, [%[c_ptr], #32]\n"
+                "fmla    v17.8h, %[b1].8h, %[a0].h[1]\n"
+                "str    q9, [%[c_ptr], #48]\n"
+                "fmla    v25.8h, %[b2].8h, %[a0].h[1]\n"
+                "str    q17, [%[c_ptr], #64]\n"
+
+                "fmla    v10.8h, %[b0].8h, %[a0].h[2]\n"
+                "str    q25, [%[c_ptr], #80]\n"
+                "fmla    v18.8h, %[b1].8h, %[a0].h[2]\n"
+                "str    q10, [%[c_ptr], #96]\n"
+                "fmla    v26.8h, %[b2].8h, %[a0].h[2]\n"
+                "str    q18, [%[c_ptr], #112]\n"
+
+                "fmla    v11.8h, %[b0].8h, %[a0].h[3]\n"
+                "str    q26, [%[c_ptr], #128]\n"
+                "fmla    v19.8h, %[b1].8h, %[a0].h[3]\n"
+                "str    q11, [%[c_ptr], #144]\n"
+                "fmla    v27.8h, %[b2].8h, %[a0].h[3]\n"
+                "str    q19, [%[c_ptr], #160]\n"
+
+                "fmla     v12.8h, %[b0].8h, %[a0].h[4]\n"
+                "str    q27, [%[c_ptr], #176]\n"
+                "fmla    v20.8h, %[b1].8h, %[a0].h[4]\n"
+                "str    q12, [%[c_ptr], #192]\n"
+                "fmla    v28.8h, %[b2].8h, %[a0].h[4]\n"
+                "str    q20, [%[c_ptr], #208]\n"
+
+                "fmla      v13.8h, %[b0].8h, %[a0].h[5]\n"
+                "str    q28, [%[c_ptr], #224]\n"
+                "fmla    v21.8h, %[b1].8h, %[a0].h[5]\n"
+                "str    q13, [%[c_ptr], #240]\n"
+                "fmla    v29.8h, %[b2].8h, %[a0].h[5]\n"
+                "str    q21, [%[c_ptr], #256]\n"
+
+                "fmla    v14.8h, %[b0].8h, %[a0].h[6]\n"
+                "str    q29, [%[c_ptr], #272]\n"
+                "fmla    v22.8h, %[b1].8h, %[a0].h[6]\n"
+                "str    q14, [%[c_ptr], #288]\n"
+                "fmla    v30.8h, %[b2].8h, %[a0].h[6]\n"
+                "str    q22, [%[c_ptr], #304]\n"
+
+                "fmla    v15.8h, %[b0].8h, %[a0].h[7]\n"
+                "str    q30, [%[c_ptr], #320]\n"
+                "fmla    v23.8h, %[b1].8h, %[a0].h[7]\n"
+                "str    q15, [%[c_ptr], #336]\n"
+                "fmla    v31.8h, %[b2].8h, %[a0].h[7]\n"
+
+                "3:\n"
+                "str    q23, [%[c_ptr], #352]\n"
+                "str    q31, [%[c_ptr], #368]\n"
+                "add    %[c_ptr], %[c_ptr], #384\n"
+                :
+                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+                [a0] "+w"(a0), [a0a] "+w"(a0a),
+                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k),
+                [b0a] "+w"(b0a), [b1a] "+w"(b1a), [b2a] "+w"(b2a)
+                : [oddk] "r"(oddk)
+                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+                "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__ && __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
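
A plain C++ sketch (built with the same FP16 flags as the files above) of what one 24x8 output block of the two kernels above computes, assuming the packing described in their header comments (A interleaved by 8, B interleaved by 24, C written row-major). The function name and exact index expressions are illustrative:

void hgemm_24x8_block_reference(const __fp16 *a_block, // 8 rows interleaved: a_block[k * 8 + row]
                                const __fp16 *b_block, // 24 columns interleaved: b_block[k * 24 + col]
                                __fp16 *c_block,       // 8 x 24, row-major: c_block[row * 24 + col]
                                int K)
{
    for(int row = 0; row < 8; row++)
    {
        for(int col = 0; col < 24; col++)
        {
            __fp16 acc = 0;
            for(int k = 0; k < K; k++)
            {
                // Accumulation stays in fp16, matching the fmla v.8h instructions.
                acc += a_block[k * 8 + row] * b_block[k * 24 + col];
            }
            c_block[row * 24 + col] = acc;
        }
    }
}
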
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
new file mode 100644
index 0000000..91a9e8d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_sgemm_asimd_12x8(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_12x8_a53(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_12x8_a55(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_12x8_a55r1(const float *, const float *, float *, int, int, int);
+
+// 12x8 SGEMM "strategy" class.
+//
+// This describes the characteristics of a family of kernels, in terms of
+// the required interleave properties and the output block size.
+//
+// All kernels in the family must share these characteristics.  The actual
+// kernel to be used can be chosen at runtime, based on the CPUInfo
+// structure passed to the constructor.
+class sgemm_12x8
+{
+public:
+    typedef float operand_type;
+    typedef float result_type;
+
+    typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
+
+    /* Describes the data layout for A input */
+    static const int A_interleave = 8;
+    static const int A_block      = 1;
+    static const int A_transpose  = 0;
+
+    /* Same for B input */
+    static const int B_interleave = 12;
+    static const int B_block      = 1;
+    static const int B_transpose  = 1;
+
+    /* Kernel blocking parameters */
+    static const int out_width  = 12;
+    static const int out_height = 8;
+    static const int k_unroll   = 1;
+
+    kern_type kernel = a64_sgemm_asimd_12x8;
+
+    sgemm_12x8(const CPUInfo *ci)
+    {
+        // Select specific kernel if available
+        switch(ci->get_cpu_model())
+        {
+            case CPUModel::A53:
+                kernel = a64_sgemm_asimd_12x8_a53;
+                break;
+
+            case CPUModel::A55r0:
+                kernel = a64_sgemm_asimd_12x8_a55;
+                break;
+
+            case CPUModel::A55r1:
+                kernel = a64_sgemm_asimd_12x8_a55r1;
+                break;
+
+            default:
+                /* Generic kernel is initialized by default. */
+                break;
+        }
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
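
A short usage sketch for the class above (illustrative only; obtaining the CPUInfo pointer and packing the panels are assumed to be handled by the surrounding library):

void run_sgemm_12x8(const CPUInfo *ci,
                    const float *a_panel, const float *b_panel, float *c_panel,
                    int ablocks, int bblocks, int K)
{
    // The constructor swaps in the A53/A55r0/A55r1 variant where appropriate;
    // otherwise the generic ASIMD kernel initialised in the class is used.
    arm_gemm::sgemm_12x8 strat(ci);
    strat.kernel(a_panel, b_panel, c_panel, ablocks, bblocks, K);
}
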
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp
new file mode 100644
index 0000000..618ebc7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm
+{
+void a64_sgemm_asimd_12x8_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K)
+{
+    const float *a_ptr = Apanel;
+    float       *c_ptr = Cpanel;
+
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const float *a_ptr0 = a_ptr;
+        const float *b_ptr  = Bpanel;
+
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            a_ptr = a_ptr0;
+            // Fix up for odd lengths - set a flag if K is odd, but make
+            // sure we round up the iteration count.
+            int oddk = (K & 1);
+            int k    = ((K + 1) / 2) - 1;
+
+            register float32x4_t a0 asm("v0");
+            register float32x4_t a1 asm("v1");
+            register float32x4_t b0 asm("v2");
+            register float32x4_t b1 asm("v3");
+            register float32x4_t b2 asm("v4");
+            register float32x4_t a0a asm("v5");
+            register float32x4_t a1a asm("v6");
+
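+            // The "ldr d" / "ldr x20" / "ins" pairs and the interleaved "nop"s below
+            // are scheduling aids for the in-order A53 pipeline: 128-bit operands are
+            // fetched as two 64-bit halves, and the nops pad the instruction stream so
+            // loads and fmlas dual-issue as intended.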
+            __asm __volatile(
+                // Initialize result registers, load initial operands, prime prefetches.
+                "movi    v8.4s, #0x0\n"
+                "ldr    %q[a0], [%[a_ptr]]\n"
+                "movi    v9.4s, #0x0\n"
+                "ldr    %q[b0], [%[b_ptr]]\n"
+                "movi    v10.4s, #0x0\n"
+                "ldr    %q[a1], [%[a_ptr], #16]\n"
+                "movi    v11.4s, #0x0\n"
+                "ldr    %q[b1], [%[b_ptr], #16]\n"
+                "movi    v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi    v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi    v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi    v15.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #128]")
+                "movi    v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi    v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi    v18.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi    v19.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi    v20.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]")
+                "movi    v21.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #384]")
+                "movi    v22.4s, #0x0\n"
+                "movi    v23.4s, #0x0\n"
+                "movi    v24.4s, #0x0\n"
+                "movi    v25.4s, #0x0\n"
+                "movi    v26.4s, #0x0\n"
+                "movi    v27.4s, #0x0\n"
+                "movi    v28.4s, #0x0\n"
+                "movi    v29.4s, #0x0\n"
+                "movi    v30.4s, #0x0\n"
+                "movi    v31.4s, #0x0\n"
+
+                // Skip loop if we are doing zero iterations of it.
+                "cbz    %w[k], 4f\n"
+
+                "1:\n"
+                // Unroll 0
+                "ldr    %d[b2], [%[b_ptr], #32]\n"
+                "nop\n"
+                "fmla    v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "ldr    x20, [%[b_ptr], #40]\n"
+                "fmla    v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "subs    %w[k], %w[k], #1\n"
+                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
+
+                "ldr    %d[a0a], [%[a_ptr], #32]\n"
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "ldr    x20, [%[a_ptr], #40]\n"
+                "fmla    v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla    v13.4s, %[b0].4s, %[a1].s[1]\n"
+
+                "ldr    %d[a1a], [%[a_ptr], #48]\n"
+                "ins    %[a0a].d[1], x20\n"
+                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "ldr    x20, [%[a_ptr], #56]\n"
+                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
+
+                "ldr    %d[b0], [%[b_ptr], #48]\n"
+                "ins    %[a1a].d[1], x20\n"
+                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "ldr    x20, [%[b_ptr], #56]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
+
+                ASM_PREFETCH("[%[a_ptr], #320]")
+                "ins    %[b0].d[1], x20\n"
+                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
+
+                ASM_PREFETCH("[%[b_ptr], #448]")
+                "nop\n"
+                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n"
+
+                "ldr    %d[b1], [%[b_ptr], #64]\n"
+                "nop\n"
+                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "ldr    x20, [%[b_ptr], #72]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
+
+                ASM_PREFETCH("[%[b_ptr], #512]")
+                "ins    %[b1].d[1], x20\n"
+                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+                // Unroll 1
+                "ldr    %d[b2], [%[b_ptr], #80]\n"
+                "nop\n"
+                "fmla    v8.4s , %[b0].4s, %[a0a].s[0]\n"
+                "ldr    x20, [%[b_ptr], #88]\n"
+                "fmla    v9.4s , %[b0].4s, %[a0a].s[1]\n"
+                "fmla    v10.4s, %[b0].4s, %[a0a].s[2]\n"
+
+                "ldr    %d[a0], [%[a_ptr], #64]\n"
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v11.4s, %[b0].4s, %[a0a].s[3]\n"
+                "ldr    x20, [%[a_ptr], #72]\n"
+                "fmla    v12.4s, %[b0].4s, %[a1a].s[0]\n"
+                "fmla    v13.4s, %[b0].4s, %[a1a].s[1]\n"
+
+                "ldr    %d[a1], [%[a_ptr], #80]\n"
+                "ins    %[a0].d[1], x20\n"
+                "fmla    v14.4s, %[b0].4s, %[a1a].s[2]\n"
+                "ldr    x20, [%[a_ptr], #88]\n"
+                "fmla    v15.4s, %[b0].4s, %[a1a].s[3]\n"
+                "fmla    v16.4s, %[b1].4s, %[a0a].s[0]\n"
+
+                "ldr    %d[b0], [%[b_ptr], #96]\n"
+                "ins    %[a1].d[1], x20\n"
+                "fmla    v17.4s, %[b1].4s, %[a0a].s[1]\n"
+                "ldr    x20, [%[b_ptr], #104]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0a].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0a].s[3]\n"
+
+                "nop\n"
+                "ins    %[b0].d[1], x20\n"
+                "fmla    v20.4s, %[b1].4s, %[a1a].s[0]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1a].s[1]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1a].s[2]\n"
+
+                "nop\n"
+                "nop\n"
+                "fmla    v23.4s, %[b1].4s, %[a1a].s[3]\n"
+                "fmla    v24.4s, %[b2].4s, %[a0a].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0a].s[1]\n"
+
+                "ldr    %d[b1], [%[b_ptr], #112]\n"
+                "nop\n"
+                "fmla    v26.4s, %[b2].4s, %[a0a].s[2]\n"
+                "ldr    x20, [%[b_ptr], #120]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0a].s[3]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+                "fmla    v28.4s, %[b2].4s, %[a1a].s[0]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+
+                "nop\n"
+                "ins    %[b1].d[1], x20\n"
+                "fmla    v29.4s, %[b2].4s, %[a1a].s[1]\n"
+                "fmla    v30.4s, %[b2].4s, %[a1a].s[2]\n"
+                "fmla    v31.4s, %[b2].4s, %[a1a].s[3]\n"
+
+                "bne    1b\n"
+
+                // Branch here if K=1 or 2.  Do the right thing for odd/even at the end.
+                "4:\n"
+                "cbnz    %w[oddk], 2f\n"
+
+                // Detached final iteration. (even K)
+                "ldr    %d[b2], [%[b_ptr], #32]\n"
+                "nop\n"
+                "fmla    v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "ldr    x20, [%[b_ptr], #40]\n"
+                "fmla    v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "subs    %w[k], %w[k], #1\n"
+                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
+
+                "ldr    %d[a0a], [%[a_ptr], #32]\n"
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "ldr    x20, [%[a_ptr], #40]\n"
+                "fmla    v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla    v13.4s, %[b0].4s, %[a1].s[1]\n"
+
+                "ldr    %d[a1a], [%[a_ptr], #48]\n"
+                "ins    %[a0a].d[1], x20\n"
+                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "ldr    x20, [%[a_ptr], #56]\n"
+                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
+
+                "ldr    %d[b0], [%[b_ptr], #48]\n"
+                "ins    %[a1a].d[1], x20\n"
+                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "ldr    x20, [%[b_ptr], #56]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
+
+                "ins    %[b0].d[1], x20\n"
+                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
+
+                "nop\n"
+                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n"
+
+                "ldr    %d[b1], [%[b_ptr], #64]\n"
+                "nop\n"
+                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "ldr    x20, [%[b_ptr], #72]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
+
+                "ins    %[b1].d[1], x20\n"
+                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+                "ldr    %d[b2], [%[b_ptr], #80]\n"
+                "nop\n"
+                "fmla    v8.4s , %[b0].4s, %[a0a].s[0]\n"
+                "ldr    x20, [%[b_ptr], #88]\n"
+                "fmla    v9.4s , %[b0].4s, %[a0a].s[1]\n"
+                "fmla    v10.4s, %[b0].4s, %[a0a].s[2]\n"
+
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v11.4s, %[b0].4s, %[a0a].s[3]\n"
+                "fmla    v12.4s, %[b0].4s, %[a1a].s[0]\n"
+                "fmla    v13.4s, %[b0].4s, %[a1a].s[1]\n"
+                "fmla    v14.4s, %[b0].4s, %[a1a].s[2]\n"
+                "fmla    v15.4s, %[b0].4s, %[a1a].s[3]\n"
+                "fmla    v16.4s, %[b1].4s, %[a0a].s[0]\n"
+                "fmla    v17.4s, %[b1].4s, %[a0a].s[1]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0a].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0a].s[3]\n"
+                "fmla    v20.4s, %[b1].4s, %[a1a].s[0]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1a].s[1]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1a].s[2]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1a].s[3]\n"
+                "fmla    v24.4s, %[b2].4s, %[a0a].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0a].s[1]\n"
+                "fmla    v26.4s, %[b2].4s, %[a0a].s[2]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0a].s[3]\n"
+                "fmla    v28.4s, %[b2].4s, %[a1a].s[0]\n"
+                "fmla    v29.4s, %[b2].4s, %[a1a].s[1]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+                "fmla    v30.4s, %[b2].4s, %[a1a].s[2]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "fmla    v31.4s, %[b2].4s, %[a1a].s[3]\n"
+                "b    3f\n"
+
+                // Detached final iteration. (odd K)
+                "2:\n"
+                "ldr    %d[b2], [%[b_ptr], #32]\n"
+                "nop\n"
+                "fmla    v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "ldr    x20, [%[b_ptr], #40]\n"
+                "fmla    v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
+
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "fmla    v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla    v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n"
+                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "add    %[a_ptr], %[a_ptr], #32\n"
+                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "add    %[b_ptr], %[b_ptr], #48\n"
+                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+                // Common tail
+                "3:\n"
+                "str    q8,  [%[c_ptr]]\n"
+                "str    q16,  [%[c_ptr], #16]\n"
+                "str    q24,  [%[c_ptr], #32]\n"
+                "str    q9,  [%[c_ptr], #48]\n"
+                "str    q17,  [%[c_ptr], #64]\n"
+                "str    q25,  [%[c_ptr], #80]\n"
+                "str    q10,  [%[c_ptr], #96]\n"
+                "str    q18,  [%[c_ptr], #112]\n"
+                "str    q26,  [%[c_ptr], #128]\n"
+                "str    q11,  [%[c_ptr], #144]\n"
+                "str    q19,  [%[c_ptr], #160]\n"
+                "str    q27,  [%[c_ptr], #176]\n"
+                "str    q12,  [%[c_ptr], #192]\n"
+                "str    q20,  [%[c_ptr], #208]\n"
+                "str    q28,  [%[c_ptr], #224]\n"
+                "str    q13,  [%[c_ptr], #240]\n"
+                "str    q21,  [%[c_ptr], #256]\n"
+                "str    q29,  [%[c_ptr], #272]\n"
+                "str    q14,  [%[c_ptr], #288]\n"
+                "str    q22,  [%[c_ptr], #304]\n"
+                "str    q30,  [%[c_ptr], #320]\n"
+                "str    q15,  [%[c_ptr], #336]\n"
+                "str    q23,  [%[c_ptr], #352]\n"
+                "str    q31,  [%[c_ptr], #368]\n"
+                "add    %[c_ptr], %[c_ptr], #384\n"
+                :
+                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+                [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+                : [oddk] "r"(oddk)
+                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+                "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp
new file mode 100644
index 0000000..4ca25eb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm
+{
+void a64_sgemm_asimd_12x8_a55(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K)
+{
+    const float *a_ptr = Apanel;
+    float       *c_ptr = Cpanel;
+
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const float *a_ptr0 = a_ptr;
+        const float *b_ptr  = Bpanel;
+
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            a_ptr = a_ptr0;
+            // Fix up for odd lengths - set a flag if K is odd, but make
+            // sure we round up the iteration count.
+            int oddk = (K & 1);
+            int k    = ((K + 1) / 2) - 1;
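+            // For example, K=7 gives oddk=1 and k=3 (three double iterations
+            // plus the odd tail), while K=8 gives oddk=0 and k=3 (three double
+            // iterations plus the detached even-K final iteration).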
+
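+            // Binding these variables to fixed NEON registers (v0-v6) lets the
+            // inline assembly refer to them by operand name while keeping them
+            // out of the clobber list; v8-v31 are the accumulators and are
+            // clobbered explicitly below.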
+            register float32x4_t a0 asm("v0");
+            register float32x4_t a1 asm("v1");
+            register float32x4_t b0 asm("v2");
+            register float32x4_t b1 asm("v3");
+            register float32x4_t b2 asm("v4");
+            register float32x4_t a0a asm("v5");
+            register float32x4_t a1a asm("v6");
+
+            __asm __volatile(
+                // Initialize result registers, load initial operands, prime prefetches.
+                "movi    v8.4s, #0x0\n"
+                "ldr    %q[a0], [%[a_ptr]]\n"
+                "movi    v9.4s, #0x0\n"
+                "ldr    %q[b0], [%[b_ptr]]\n"
+                "movi    v10.4s, #0x0\n"
+                "ldr    %q[a1], [%[a_ptr], #16]\n"
+                "movi    v11.4s, #0x0\n"
+                "ldr    %q[b1], [%[b_ptr], #16]\n"
+                "movi    v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi    v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi    v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi    v15.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #128]")
+                "movi    v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi    v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi    v18.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi    v19.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi    v20.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]")
+                "movi    v21.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #384]")
+                "movi    v22.4s, #0x0\n"
+                "movi    v23.4s, #0x0\n"
+                "movi    v24.4s, #0x0\n"
+                "movi    v25.4s, #0x0\n"
+                "movi    v26.4s, #0x0\n"
+                "movi    v27.4s, #0x0\n"
+                "movi    v28.4s, #0x0\n"
+                "movi    v29.4s, #0x0\n"
+                "movi    v30.4s, #0x0\n"
+                "movi    v31.4s, #0x0\n"
+
+                // Skip loop if we are doing zero iterations of it.
+                "cbz    %w[k], 4f\n"
+
+                "1:\n"
+                // Unroll 0
+                "ldr    %d[b2], [%[b_ptr], #32]\n"
+                "fmla    v8.4s , %[b0].4s, %[a0].s[0]\n"
+
+                "fmla    v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "ldr    x20, [%[b_ptr], #40]\n"
+                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "subs    %w[k], %w[k], #1\n"
+
+                "ldr    %d[a0a], [%[a_ptr], #32]\n"
+                "ins    %[b2].d[1], x20\n"
+
+                "fmla    v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla    v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "ldr    x20, [%[a_ptr], #40]\n"
+                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
+
+                "ldr    %d[a1a], [%[a_ptr], #48]\n"
+                "ins    %[a0a].d[1], x20\n"
+
+                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "ldr    x20, [%[a_ptr], #56]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
+
+                "ldr    %d[b0], [%[b_ptr], #48]\n"
+                "ins    %[a1a].d[1], x20\n" ASM_PREFETCH("[%[a_ptr], #320]")
+                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "ldr    x20, [%[b_ptr], #56]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
+
+                "ldr    %d[b1], [%[b_ptr], #64]\n"
+                "ins    %[b0].d[1], x20\n"
+
+                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n"
+                "ldr    x20, [%[b_ptr], #72]\n"
+                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n" ASM_PREFETCH("[%[b_ptr], #448]")
+
+                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n" ASM_PREFETCH("[%[b_ptr], #512]")
+                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+                // Unroll 1
+                "ldr    %d[b2], [%[b_ptr], #80]\n"
+                "ins    %[b1].d[1], x20\n"
+
+                "fmla    v8.4s , %[b0].4s, %[a0a].s[0]\n"
+                "fmla    v9.4s , %[b0].4s, %[a0a].s[1]\n"
+                "ldr    x20, [%[b_ptr], #88]\n"
+                "fmla    v10.4s, %[b0].4s, %[a0a].s[2]\n"
+                "fmla    v11.4s, %[b0].4s, %[a0a].s[3]\n"
+
+                "ldr    %d[a0], [%[a_ptr], #64]\n"
+                "ins    %[b2].d[1], x20\n"
+
+                "fmla    v12.4s, %[b0].4s, %[a1a].s[0]\n"
+                "fmla    v13.4s, %[b0].4s, %[a1a].s[1]\n"
+                "ldr    x20, [%[a_ptr], #72]\n"
+                "fmla    v14.4s, %[b0].4s, %[a1a].s[2]\n"
+                "fmla    v15.4s, %[b0].4s, %[a1a].s[3]\n"
+
+                "ldr    %d[a1], [%[a_ptr], #80]\n"
+                "ins    %[a0].d[1], x20\n"
+
+                "fmla    v16.4s, %[b1].4s, %[a0a].s[0]\n"
+                "fmla    v17.4s, %[b1].4s, %[a0a].s[1]\n"
+                "ldr    x20, [%[a_ptr], #88]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0a].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0a].s[3]\n"
+
+                "ldr    %d[b0], [%[b_ptr], #96]\n"
+                "ins    %[a1].d[1], x20\n"
+
+                "fmla    v20.4s, %[b1].4s, %[a1a].s[0]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1a].s[1]\n"
+                "ldr    x20, [%[b_ptr], #104]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1a].s[2]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1a].s[3]\n"
+
+                "ldr    %d[b1], [%[b_ptr], #112]\n"
+                "ins    %[b0].d[1], x20\n"
+
+                "fmla    v24.4s, %[b2].4s, %[a0a].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0a].s[1]\n"
+                "ldr    x20, [%[b_ptr], #120]\n"
+                "fmla    v26.4s, %[b2].4s, %[a0a].s[2]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0a].s[3]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+
+                "fmla    v28.4s, %[b2].4s, %[a1a].s[0]\n"
+                "fmla    v29.4s, %[b2].4s, %[a1a].s[1]\n"
+                "fmla    v30.4s, %[b2].4s, %[a1a].s[2]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "fmla    v31.4s, %[b2].4s, %[a1a].s[3]\n"
+
+                "ldr    %d[b2], [%[b_ptr], #32]\n"
+                "ins    %[b1].d[1], x20\n"
+
+                "bne    1b\n"
+
+                // Branch here if K=1 or 2.  Do the right thing for odd/even at the end.
+                "4:\n"
+                "cbnz    %w[oddk], 2f\n"
+                "fmla    v8.4s , %[b0].4s, %[a0].s[0]\n"
+
+                // Detached final iteration. (even K)
+                "ldr    x20, [%[b_ptr], #40]\n"
+                "fmla    v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "subs    %w[k], %w[k], #1\n"
+                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
+
+                "ldr    %d[a0a], [%[a_ptr], #32]\n"
+                "ins    %[b2].d[1], x20\n"
+
+                "fmla    v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla    v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "ldr    x20, [%[a_ptr], #40]\n"
+                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
+
+                "ldr    %d[a1a], [%[a_ptr], #48]\n"
+                "ins    %[a0a].d[1], x20\n"
+
+                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "ldr    x20, [%[a_ptr], #56]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
+
+                "ldr    %d[b0], [%[b_ptr], #48]\n"
+                "ins    %[a1a].d[1], x20\n"
+
+                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "ldr    x20, [%[b_ptr], #56]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
+
+                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n"
+                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n"
+
+                "ldr    %d[b1], [%[b_ptr], #64]\n"
+                "ins    %[b0].d[1], x20\n"
+
+                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "ldr    x20, [%[b_ptr], #72]\n"
+                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+                "ldr    %d[b2], [%[b_ptr], #80]\n"
+                "ins    %[b1].d[1], x20\n"
+
+                "fmla    v8.4s , %[b0].4s, %[a0a].s[0]\n"
+                "fmla    v9.4s , %[b0].4s, %[a0a].s[1]\n"
+                "ldr    x20, [%[b_ptr], #88]\n"
+                "fmla    v10.4s, %[b0].4s, %[a0a].s[2]\n"
+
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v11.4s, %[b0].4s, %[a0a].s[3]\n"
+                "fmla    v12.4s, %[b0].4s, %[a1a].s[0]\n"
+                "fmla    v13.4s, %[b0].4s, %[a1a].s[1]\n"
+                "fmla    v14.4s, %[b0].4s, %[a1a].s[2]\n"
+                "fmla    v15.4s, %[b0].4s, %[a1a].s[3]\n"
+                "fmla    v16.4s, %[b1].4s, %[a0a].s[0]\n"
+                "fmla    v17.4s, %[b1].4s, %[a0a].s[1]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0a].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0a].s[3]\n"
+                "fmla    v20.4s, %[b1].4s, %[a1a].s[0]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1a].s[1]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1a].s[2]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1a].s[3]\n"
+                "fmla    v24.4s, %[b2].4s, %[a0a].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0a].s[1]\n"
+                "fmla    v26.4s, %[b2].4s, %[a0a].s[2]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0a].s[3]\n"
+                "fmla    v28.4s, %[b2].4s, %[a1a].s[0]\n"
+                "fmla    v29.4s, %[b2].4s, %[a1a].s[1]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+                "fmla    v30.4s, %[b2].4s, %[a1a].s[2]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "fmla    v31.4s, %[b2].4s, %[a1a].s[3]\n"
+                "b    3f\n"
+
+                // Detached final iteration. (odd K)
+                "2:\n"
+
+                "ldr    %d[b2], [%[b_ptr], #32]\n"
+                "fmla    v8.4s , %[b0].4s, %[a0].s[0]\n"
+
+                "fmla    v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "ldr    x20, [%[b_ptr], #40]\n"
+                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla    v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n"
+                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "add    %[a_ptr], %[a_ptr], #32\n"
+                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "add    %[b_ptr], %[b_ptr], #48\n"
+                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+                // Common tail
+                "3:\n"
+                "str    q8,  [%[c_ptr]]\n"
+                "str    q16,  [%[c_ptr], #16]\n"
+                "str    q24,  [%[c_ptr], #32]\n"
+                "str    q9,  [%[c_ptr], #48]\n"
+                "str    q17,  [%[c_ptr], #64]\n"
+                "str    q25,  [%[c_ptr], #80]\n"
+                "str    q10,  [%[c_ptr], #96]\n"
+                "str    q18,  [%[c_ptr], #112]\n"
+                "str    q26,  [%[c_ptr], #128]\n"
+                "str    q11,  [%[c_ptr], #144]\n"
+                "str    q19,  [%[c_ptr], #160]\n"
+                "str    q27,  [%[c_ptr], #176]\n"
+                "str    q12,  [%[c_ptr], #192]\n"
+                "str    q20,  [%[c_ptr], #208]\n"
+                "str    q28,  [%[c_ptr], #224]\n"
+                "str    q13,  [%[c_ptr], #240]\n"
+                "str    q21,  [%[c_ptr], #256]\n"
+                "str    q29,  [%[c_ptr], #272]\n"
+                "str    q14,  [%[c_ptr], #288]\n"
+                "str    q22,  [%[c_ptr], #304]\n"
+                "str    q30,  [%[c_ptr], #320]\n"
+                "str    q15,  [%[c_ptr], #336]\n"
+                "str    q23,  [%[c_ptr], #352]\n"
+                "str    q31,  [%[c_ptr], #368]\n"
+                "add    %[c_ptr], %[c_ptr], #384\n"
+                :
+                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+                [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+                : [oddk] "r"(oddk)
+                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+                "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp
new file mode 100644
index 0000000..89fe6ac
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp
@@ -0,0 +1,342 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm
+{
+void a64_sgemm_asimd_12x8_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, const int ablocks, const int bblocks, const int K)
+{
+    const float *a_ptr = Apanel;
+    float       *c_ptr = Cpanel;
+
+    // Fix up for odd lengths - set a flag if K is odd, but make
+    // sure we round up the iteration count.
+    int oddk    = (K & 1);
+    int k_iters = ((K + 1) / 2) - 1;
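+    // Unlike the plain A55 kernel, the odd-K fixup is computed once here and
+    // only the iteration count is reset per block, since K is the same for
+    // every block.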
+
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const float *a_ptr0 = a_ptr;
+        const float *b_ptr  = Bpanel;
+
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            a_ptr = a_ptr0;
+            int k = k_iters;
+
+            register float32x4_t a0 asm("v0");
+            register float32x4_t a1 asm("v1");
+            register float32x4_t b0 asm("v2");
+            register float32x4_t b1 asm("v3");
+            register float32x4_t b2 asm("v4");
+            register float32x4_t a0a asm("v5");
+            register float32x4_t a1a asm("v6");
+
+            __asm __volatile(
+                // Initialize result registers, load initial operands, prime prefetches.
+                "movi   v8.4s, #0x0\n"
+                "ldr    %q[a0], [%[a_ptr]]\n"
+                "movi   v9.4s, #0x0\n"
+                "ldr    %q[b0], [%[b_ptr]]\n"
+                "movi   v10.4s, #0x0\n"
+                "ldr    %q[a1], [%[a_ptr], #16]\n"
+                "movi   v11.4s, #0x0\n"
+                "ldr    %q[b1], [%[b_ptr], #16]\n"
+                "movi   v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi   v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi   v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi   v15.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #128]")
+                "movi   v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi   v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi   v18.4s, #0x0\n"
+                "movi   v19.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi   v20.4s, #0x0\n"
+                "movi   v21.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi   v22.4s, #0x0\n"
+                "movi   v23.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]")
+                "movi   v24.4s, #0x0\n"
+                "movi   v25.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #384]")
+                "movi   v26.4s, #0x0\n"
+                "movi   v27.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #448]")
+                "movi   v28.4s, #0x0\n"
+                "movi   v29.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #384]")
+                "movi   v30.4s, #0x0\n"
+                "movi   v31.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #512]")
+
+                // The loop is rotated by these two instructions, which must
+                // always be executed: they start iteration 0 here, and each
+                // pass through the loop issues the same FMLA/load pair for
+                // the next iteration just before the conditional branch.
+                "fmla    v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "ldr    %d[b2], [%[b_ptr], #32]\n"
+
+                // Skip loop if we are doing zero iterations of it.
+                "cbz    %w[k], 4f\n"
+
+                "1:\n"
+                // Unroll 0
+                "fmla    v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "ldr    x20, [%[b_ptr], #40]\n"
+                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "subs    %w[k], %w[k], #1\n"
+                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "ldr    %d[a0a], [%[a_ptr], #32]\n"
+
+                "fmla    v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "ldr    x20, [%[a_ptr], #40]\n"
+                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "ldr    %d[a1a], [%[a_ptr], #48]\n"
+
+                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "ins    %[a0a].d[1], x20\n"
+                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "ldr    x20, [%[a_ptr], #56]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "ldr    %d[b0], [%[b_ptr], #48]\n"
+
+                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "ins    %[a1a].d[1], x20\n"
+                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "ldr    x20, [%[b_ptr], #56]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "ldr    %d[b1], [%[b_ptr], #64]\n"
+
+                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "ins    %[b0].d[1], x20\n"
+                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n"
+                "ldr    x20, [%[b_ptr], #72]\n"
+                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n" ASM_PREFETCH("[%[a_ptr], #448]")
+
+                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n" ASM_PREFETCH("[%[b_ptr], #576]")
+                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+                // Unroll 1
+                "ldr    %d[b2], [%[b_ptr], #80]\n"
+
+                "fmla    v8.4s , %[b0].4s, %[a0a].s[0]\n"
+                "ins    %[b1].d[1], x20\n"
+                "fmla    v9.4s , %[b0].4s, %[a0a].s[1]\n"
+                "ldr    x20, [%[b_ptr], #88]\n"
+                "fmla    v10.4s, %[b0].4s, %[a0a].s[2]\n"
+                "fmla    v11.4s, %[b0].4s, %[a0a].s[3]\n"
+                "ldr    %d[a0], [%[a_ptr], #64]\n"
+
+                "fmla    v12.4s, %[b0].4s, %[a1a].s[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v13.4s, %[b0].4s, %[a1a].s[1]\n"
+                "ldr    x20, [%[a_ptr], #72]\n"
+                "fmla    v14.4s, %[b0].4s, %[a1a].s[2]\n"
+                "fmla    v15.4s, %[b0].4s, %[a1a].s[3]\n"
+                "ldr    %d[a1], [%[a_ptr], #80]\n"
+
+                "fmla    v16.4s, %[b1].4s, %[a0a].s[0]\n"
+                "ins    %[a0].d[1], x20\n"
+                "fmla    v17.4s, %[b1].4s, %[a0a].s[1]\n"
+                "ldr    x20, [%[a_ptr], #88]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0a].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0a].s[3]\n"
+                "ldr    %d[b0], [%[b_ptr], #96]\n"
+
+                "fmla    v20.4s, %[b1].4s, %[a1a].s[0]\n"
+                "ins    %[a1].d[1], x20\n"
+                "fmla    v21.4s, %[b1].4s, %[a1a].s[1]\n"
+                "ldr    x20, [%[b_ptr], #104]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1a].s[2]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1a].s[3]\n"
+                "ldr    %d[b1], [%[b_ptr], #112]\n"
+
+                "fmla    v24.4s, %[b2].4s, %[a0a].s[0]\n"
+                "ins    %[b0].d[1], x20\n"
+                "fmla    v25.4s, %[b2].4s, %[a0a].s[1]\n"
+                "ldr    x20, [%[b_ptr], #120]\n"
+                "fmla    v26.4s, %[b2].4s, %[a0a].s[2]\n"
+
+                "fmla    v27.4s, %[b2].4s, %[a0a].s[3]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+
+                "fmla    v28.4s, %[b2].4s, %[a1a].s[0]\n" ASM_PREFETCH("[%[b_ptr], #640]")
+                "fmla    v29.4s, %[b2].4s, %[a1a].s[1]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "fmla    v30.4s, %[b2].4s, %[a1a].s[2]\n"
+                "ins    %[b1].d[1], x20\n"
+                "fmla    v31.4s, %[b2].4s, %[a1a].s[3]\n"
+                "ldr    %d[b2], [%[b_ptr], #32]\n"
+
+                "fmla    v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "b.ne    1b\n"
+
+                // Branch here if K=1 or 2.  Do the right thing for odd/even at the end.
+                "4:\n"
+
+                // Start final iteration - branch off to "odd" code before we load a0a.
+                "fmla    v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "ldr    x20, [%[b_ptr], #40]\n"
+                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "cbnz    %w[oddk], 2f\n"
+
+                // Even K continuation
+                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "ldr    %d[a0a], [%[a_ptr], #32]\n"
+
+                "fmla    v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "ldr    x20, [%[a_ptr], #40]\n"
+                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n" ASM_PREFETCHW("[%[c_ptr]]")
+                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "ldr    %d[a1a], [%[a_ptr], #48]\n"
+
+                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "ins    %[a0a].d[1], x20\n"
+                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "ldr    x20, [%[a_ptr], #56]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "ldr    %d[b0], [%[b_ptr], #48]\n"
+
+                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "ins    %[a1a].d[1], x20\n"
+                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "ldr    x20, [%[b_ptr], #56]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
+
+                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "ldr    %d[b1], [%[b_ptr], #64]\n"
+
+                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "ins    %[b0].d[1], x20\n"
+                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "ldr    x20, [%[b_ptr], #72]\n"
+                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+                "ldr    %d[b2], [%[b_ptr], #80]\n"
+
+                "fmla    v8.4s , %[b0].4s, %[a0a].s[0]\n"
+                "ins    %[b1].d[1], x20\n"
+                "fmla    v9.4s , %[b0].4s, %[a0a].s[1]\n"
+                "ldr    x20, [%[b_ptr], #88]\n"
+                "fmla    v10.4s, %[b0].4s, %[a0a].s[2]\n"
+                "ins    %[b2].d[1], x20\n"
+
+                "fmla    v11.4s, %[b0].4s, %[a0a].s[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+                "fmla    v12.4s, %[b0].4s, %[a1a].s[0]\n"
+                "fmla    v13.4s, %[b0].4s, %[a1a].s[1]\n"
+                "fmla    v14.4s, %[b0].4s, %[a1a].s[2]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+                "fmla    v15.4s, %[b0].4s, %[a1a].s[3]\n"
+                "fmla    v16.4s, %[b1].4s, %[a0a].s[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+                "fmla    v17.4s, %[b1].4s, %[a0a].s[1]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0a].s[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+                "fmla    v19.4s, %[b1].4s, %[a0a].s[3]\n"
+                "fmla    v20.4s, %[b1].4s, %[a1a].s[0]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1a].s[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]")
+                "fmla    v22.4s, %[b1].4s, %[a1a].s[2]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1a].s[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]")
+                "fmla    v24.4s, %[b2].4s, %[a0a].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0a].s[1]\n"
+                "fmla    v26.4s, %[b2].4s, %[a0a].s[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #640]")
+                "fmla    v27.4s, %[b2].4s, %[a0a].s[3]\n"
+                "fmla    v28.4s, %[b2].4s, %[a1a].s[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+                "fmla    v29.4s, %[b2].4s, %[a1a].s[1]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+                "fmla    v30.4s, %[b2].4s, %[a1a].s[2]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "fmla    v31.4s, %[b2].4s, %[a1a].s[3]\n"
+                "b    3f\n"
+
+                // Odd K continuation
+                "2:\n"
+                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n" ASM_PREFETCHW("[%[c_ptr]]")
+                "fmla    v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "ins    %[b2].d[1], x20\n"
+                "fmla    v13.4s, %[b0].4s, %[a1].s[1]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "add    %[a_ptr], %[a_ptr], #32\n"
+                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "add    %[b_ptr], %[b_ptr], #48\n"
+                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]")
+                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]")
+                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #640]")
+                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+                // Common tail
+                "3:\n"
+                "str    q8,   [%[c_ptr]]\n"
+                "str    q16,  [%[c_ptr], #16]\n"
+                "str    q24,  [%[c_ptr], #32]\n"
+                "str    q9,   [%[c_ptr], #48]\n"
+                "str    q17,  [%[c_ptr], #64]\n"
+                "str    q25,  [%[c_ptr], #80]\n"
+                "str    q10,  [%[c_ptr], #96]\n"
+                "str    q18,  [%[c_ptr], #112]\n"
+                "str    q26,  [%[c_ptr], #128]\n"
+                "str    q11,  [%[c_ptr], #144]\n"
+                "str    q19,  [%[c_ptr], #160]\n"
+                "str    q27,  [%[c_ptr], #176]\n"
+                "str    q12,  [%[c_ptr], #192]\n"
+                "str    q20,  [%[c_ptr], #208]\n"
+                "str    q28,  [%[c_ptr], #224]\n"
+                "str    q13,  [%[c_ptr], #240]\n"
+                "str    q21,  [%[c_ptr], #256]\n"
+                "str    q29,  [%[c_ptr], #272]\n"
+                "str    q14,  [%[c_ptr], #288]\n"
+                "str    q22,  [%[c_ptr], #304]\n"
+                "str    q30,  [%[c_ptr], #320]\n"
+                "str    q15,  [%[c_ptr], #336]\n"
+                "str    q23,  [%[c_ptr], #352]\n"
+                "str    q31,  [%[c_ptr], #368]\n"
+                "add    %[c_ptr], %[c_ptr], #384\n"
+                :
+                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+                [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+                : [oddk] "r"(oddk)
+                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+                "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp
new file mode 100644
index 0000000..42e870e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp
@@ -0,0 +1,350 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 12x8), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
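+//
+// As a rough sketch (an illustration inferred from the comment above and the
+// pointer arithmetic below, not an additional contract), the panel sizes are:
+//
+//   Apanel: ablocks * 8 * K floats, one 8xK block after another
+//   Bpanel: bblocks * 12 * K floats, one 12xK block after another
+//   Cpanel: ablocks * bblocks * 96 floats, one 12x8 tile per (yb, xb) pair
+//
+// which matches a_ptr advancing by 8 floats per K step, b_ptr by 12 floats per
+// K step, and c_ptr by 384 bytes per output tile.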
+
+namespace arm_gemm
+{
+void a64_sgemm_asimd_12x8_jumps(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K, long int row_jump = 0, long int block_jump = 0)
+{
+    const float *a_ptr = Apanel;
+    float       *c_ptr = Cpanel;
+
+    for(int yb = 0; yb < ablocks; yb++)
+    {
+        const float *a_ptr0 = a_ptr;
+        const float *b_ptr  = Bpanel;
+
+        for(int xb = 0; xb < bblocks; xb++)
+        {
+            a_ptr = a_ptr0;
+            // Fix up for odd lengths - set a flag if K is odd, but make
+            // sure we round up the iteration count.
+            int oddk = (K & 1);
+            int k    = ((K + 1) / 2) - 1;
+
+            register float32x4_t a0 asm("v0");
+            register float32x4_t a1 asm("v1");
+            register float32x4_t b0 asm("v2");
+            register float32x4_t b1 asm("v3");
+            register float32x4_t b2 asm("v4");
+            register float32x4_t a0a asm("v5");
+            register float32x4_t a1a asm("v6");
+
+            __asm __volatile(
+                // Initialize result registers, load initial operands, prime prefetches.
+                "movi    v8.4s, #0x0\n"
+                "ldr    %q[a0], [%[a_ptr]]\n"
+                "movi    v9.4s, #0x0\n"
+                "ldr    %q[b0], [%[b_ptr]]\n"
+                "movi    v10.4s, #0x0\n"
+                "ldr    %q[a1], [%[a_ptr], #16]\n"
+                "movi    v11.4s, #0x0\n"
+                "ldr    %q[b1], [%[b_ptr], #16]\n"
+                "movi    v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi    v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi    v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi    v15.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #128]")
+                "movi    v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi    v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi    v18.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi    v19.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi    v20.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]")
+                "movi    v21.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #384]")
+                "movi    v22.4s, #0x0\n"
+                "movi    v23.4s, #0x0\n"
+                "movi    v24.4s, #0x0\n"
+                "movi    v25.4s, #0x0\n"
+                "movi    v26.4s, #0x0\n"
+                "movi    v27.4s, #0x0\n"
+                "movi    v28.4s, #0x0\n"
+                "movi    v29.4s, #0x0\n"
+                "movi    v30.4s, #0x0\n"
+                "movi    v31.4s, #0x0\n"
+
+                // Skip loop if we are doing zero iterations of it.
+                "cbz    %w[k], 4f\n"
+
+                // Loop proper
+                "1:\n"
+                "fmla     v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "fmla      v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "ldr    %q[b2], [%[b_ptr], #32]\n"
+                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "add    %[b_ptr], %[b_ptr], %[row_jump]\n"
+                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "ldr    %q[a0a], [%[a_ptr], #32]\n"
+                "fmla     v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla    v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "ldr    %q[a1a], [%[a_ptr], #48]\n"
+                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "ldr    %q[b0], [%[b_ptr], #48]\n"
+
+                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n" ASM_PREFETCH("[%[a_ptr], #320]")
+                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "ldr    %q[b1], [%[b_ptr], #64]\n"
+
+                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n" ASM_PREFETCH("[%[b_ptr], #448]")
+                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+                "ldr    %q[b2], [%[b_ptr], #80]\n"
+
+                "fmla     v8.4s , %[b0].4s, %[a0a].s[0]\n"
+                "fmla    v9.4s , %[b0].4s, %[a0a].s[1]\n"
+                "ldr    %q[a0], [%[a_ptr], #64]\n"
+                "fmla    v10.4s, %[b0].4s, %[a0a].s[2]\n"
+                "add    %[b_ptr], %[b_ptr], %[row_jump]\n"
+                "fmla    v11.4s, %[b0].4s, %[a0a].s[3]\n"
+                "fmla     v12.4s, %[b0].4s, %[a1a].s[0]\n"
+                "ldr    %q[a1], [%[a_ptr], #80]\n"
+                "fmla   v13.4s, %[b0].4s, %[a1a].s[1]\n"
+                "fmla    v14.4s, %[b0].4s, %[a1a].s[2]\n"
+                "fmla    v15.4s, %[b0].4s, %[a1a].s[3]\n"
+                "ldr    %q[b0], [%[b_ptr], #96]\n"
+
+                "fmla    v16.4s, %[b1].4s, %[a0a].s[0]\n"
+                "fmla    v17.4s, %[b1].4s, %[a0a].s[1]\n" ASM_PREFETCH("[%[b_ptr], #512]")
+                "fmla    v18.4s, %[b1].4s, %[a0a].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0a].s[3]\n"
+                "fmla    v20.4s, %[b1].4s, %[a1a].s[0]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1a].s[1]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1a].s[2]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1a].s[3]\n"
+                "ldr    %q[b1], [%[b_ptr], #112]\n"
+
+                "fmla    v24.4s, %[b2].4s, %[a0a].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0a].s[1]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+                "fmla    v26.4s, %[b2].4s, %[a0a].s[2]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0a].s[3]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "fmla    v28.4s, %[b2].4s, %[a1a].s[0]\n"
+                "fmla    v29.4s, %[b2].4s, %[a1a].s[1]\n"
+                "subs    %w[k], %w[k], #1\n"
+                "fmla    v30.4s, %[b2].4s, %[a1a].s[2]\n"
+                "fmla    v31.4s, %[b2].4s, %[a1a].s[3]\n"
+                "bne    1b\n"
+
+                // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
+                "4:\n"
+
+                // Branch to alternative tail for odd K
+                "cbnz    %w[oddk], 2f\n"
+
+                // Detached final iteration (even K)
+                "fmla     v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "fmla   v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "ldr    %q[b2], [%[b_ptr], #32]\n"
+                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "add    %[b_ptr], %[b_ptr], %[row_jump]\n"
+                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "ldr    %q[a0a], [%[a_ptr], #32]\n"
+                "fmla     v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla   v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "ldr    %q[a1a], [%[a_ptr], #48]\n"
+                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "ldr    %q[b0], [%[b_ptr], #48]\n"
+
+                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "ldr    %q[b1], [%[b_ptr], #64]\n"
+
+                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n"
+                "add    %[a_ptr], %[a_ptr], #64\n"
+                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+                "ldr    %q[b2], [%[b_ptr], #80]\n"
+
+                "fmla     v8.4s , %[b0].4s, %[a0a].s[0]\n"
+                "add    %[b_ptr], %[b_ptr], %[block_jump]\n"
+                "fmla    v16.4s, %[b1].4s, %[a0a].s[0]\n"
+                "add    %[b_ptr], %[b_ptr], #96\n"
+                "fmla   v9.4s , %[b0].4s, %[a0a].s[1]\n"
+                "add    %[b_ptr], %[b_ptr], %[row_jump]\n"
+                "str    q8, [%[c_ptr], #0]\n"
+                "fmla    v17.4s, %[b1].4s, %[a0a].s[1]\n"
+                "str    q16, [%[c_ptr], #16]\n"
+                "fmla    v24.4s, %[b2].4s, %[a0a].s[0]\n"
+                "str    q24, [%[c_ptr], #32]\n"
+
+                "fmla    v25.4s, %[b2].4s, %[a0a].s[1]\n"
+                "str    q9, [%[c_ptr], #48]\n"
+                "fmla    v10.4s, %[b0].4s, %[a0a].s[2]\n"
+                "str    q17, [%[c_ptr], #64]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0a].s[2]\n"
+                "str    q25, [%[c_ptr], #80]\n"
+                "fmla    v26.4s, %[b2].4s, %[a0a].s[2]\n"
+                "str    q10, [%[c_ptr], #96]\n"
+
+                "fmla    v11.4s, %[b0].4s, %[a0a].s[3]\n"
+                "str    q18, [%[c_ptr], #112]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0a].s[3]\n"
+                "str    q26, [%[c_ptr], #128]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0a].s[3]\n"
+                "str    q11, [%[c_ptr], #144]\n"
+
+                "fmla     v12.4s, %[b0].4s, %[a1a].s[0]\n"
+                "str    q19, [%[c_ptr], #160]\n"
+                "fmla    v20.4s, %[b1].4s, %[a1a].s[0]\n"
+                "str    q27, [%[c_ptr], #176]\n"
+                "fmla    v28.4s, %[b2].4s, %[a1a].s[0]\n"
+                "str    q12, [%[c_ptr], #192]\n"
+
+                "fmla   v13.4s, %[b0].4s, %[a1a].s[1]\n"
+                "str    q20, [%[c_ptr], #208]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1a].s[1]\n"
+                "str    q28, [%[c_ptr], #224]\n"
+                "fmla    v29.4s, %[b2].4s, %[a1a].s[1]\n"
+                "str    q13, [%[c_ptr], #240]\n"
+
+                "fmla    v14.4s, %[b0].4s, %[a1a].s[2]\n"
+                "str    q21, [%[c_ptr], #256]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1a].s[2]\n"
+                "str    q29, [%[c_ptr], #272]\n"
+                "fmla    v30.4s, %[b2].4s, %[a1a].s[2]\n"
+                "str    q14, [%[c_ptr], #288]\n"
+
+                "fmla    v15.4s, %[b0].4s, %[a1a].s[3]\n"
+                "str    q22, [%[c_ptr], #304]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1a].s[3]\n"
+                "str    q30, [%[c_ptr], #320]\n"
+                "fmla    v31.4s, %[b2].4s, %[a1a].s[3]\n"
+                "str    q15, [%[c_ptr], #336]\n"
+
+                "b    3f\n"
+
+                // Detached final iteration (odd K)
+                "2:\n"
+                "fmla     v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "ldr    %q[b2], [%[b_ptr], #32]\n"
+                "fmla    v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "add    %[b_ptr], %[b_ptr], %[row_jump]\n"
+                "fmla   v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "str    q8, [%[c_ptr], #0]\n"
+                "fmla    v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "str    q16, [%[c_ptr], #16]\n"
+                "fmla    v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "add    %[b_ptr], %[b_ptr], #48\n"
+                "add    %[a_ptr], %[a_ptr], #32\n"
+                "str    q24, [%[c_ptr], #32]\n"
+                "fmla    v25.4s, %[b2].4s, %[a0].s[1]\n"
+                "str    q9, [%[c_ptr], #48]\n"
+
+                "fmla    v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "str    q17, [%[c_ptr], #64]\n"
+                "fmla    v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "str    q25, [%[c_ptr], #80]\n"
+                "fmla    v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "str    q10, [%[c_ptr], #96]\n"
+
+                "fmla    v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "str    q18, [%[c_ptr], #112]\n"
+                "fmla    v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "str    q26, [%[c_ptr], #128]\n"
+                "fmla    v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "str    q11, [%[c_ptr], #144]\n"
+
+                "fmla     v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "str    q19, [%[c_ptr], #160]\n"
+                "fmla    v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "str    q27, [%[c_ptr], #176]\n"
+                "fmla    v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "str    q12, [%[c_ptr], #192]\n"
+
+                "fmla   v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "str    q20, [%[c_ptr], #208]\n"
+                "fmla    v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "str    q28, [%[c_ptr], #224]\n"
+                "fmla    v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "str    q13, [%[c_ptr], #240]\n"
+
+                "fmla    v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "str    q21, [%[c_ptr], #256]\n"
+                "fmla    v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "str    q29, [%[c_ptr], #272]\n"
+                "fmla    v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "str    q14, [%[c_ptr], #288]\n"
+
+                "fmla    v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "str    q22, [%[c_ptr], #304]\n"
+                "fmla    v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "str    q30, [%[c_ptr], #320]\n"
+                "fmla    v31.4s, %[b2].4s, %[a1].s[3]\n"
+                "str    q15, [%[c_ptr], #336]\n"
+
+                // Common tail
+                "3:\n"
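+                // Most of the output tile has already been stored above,
+                // interleaved with the final FMLAs; only the last two vectors
+                // of the tile remain to be written here.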
+                "str    q23, [%[c_ptr], #352]\n"
+                "str    q31, [%[c_ptr], #368]\n"
+                "add    %[c_ptr], %[c_ptr], #384\n"
+                :
+                [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+                [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+                [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+                : [oddk] "r"(oddk), [row_jump] "r"(row_jump), [block_jump] "r"(block_jump)
+                : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+                "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+        }
+    }
+}
+
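+// Default entry point: row_jump and block_jump above are extra byte offsets
+// added to b_ptr while walking the B panel; the plain 12x8 kernel below simply
+// forwards to the jumps variant with both set to zero.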
+void a64_sgemm_asimd_12x8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K)
+{
+    a64_sgemm_asimd_12x8_jumps(Apanel, Bpanel, Cpanel, ablocks, bblocks, K, 0, 0);
+}
+
+} // namespace arm_gemm
+
+#endif
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp
new file mode 100644
index 0000000..eceacc9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_sgemm_native_16x4(const float *, int, const float *, int, float *, int, float, int, int, int);
+
+// 16x4 native SGEMM "strategy" class.
+//
+// This describes the characteristics of a family of kernels, in terms of
+// the required interleave properties and the output block size.
+//
+// All kernels in the family must share these characteristics.  The actual
+// kernel to be used can be chosen at runtime, based on the CPUInfo
+// structure.
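+//
+// A minimal usage sketch (illustrative only; the surrounding driver code is
+// assumed, not part of this header):
+//
+//   sgemm_native_16x4 strat(ci);
+//   strat.kernel(A, lda, B, ldb, C, ldc, beta, M, N, K);
+//
+// with the argument order given by kern_type below.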
+class sgemm_native_16x4
+{
+public:
+    typedef float operand_type;
+    typedef float result_type;
+
+    typedef void (*kern_type)(const float *, int, const float *, int, float *, int, float, int, int, int);
+
+    /* Kernel blocking parameters */
+    static const int out_width  = 16;
+    static const int out_height = 4;
+    static const int k_unroll   = 1;
+
+    // Default to the generic kernel
+    kern_type kernel = a64_sgemm_native_16x4;
+
+    sgemm_native_16x4(const CPUInfo *ci)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4/generic.cpp
new file mode 100644
index 0000000..1b5787c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4/generic.cpp
@@ -0,0 +1,734 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+
+#include <arm_neon.h>
+
+namespace arm_gemm
+{
+void a64_sgemm_native_16x4(const float *A, int lda, const float *B, int ldb, float *C, int ldc, float beta, int M, int N, int K)
+{
+    int oddk  = (K % 8) ? 1 : 0;
+    int beta0 = (beta == 0.0f) ? 1 : 0;
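+    // Flags consumed by the assembly below: oddk marks a K that is not a
+    // multiple of 8 (the unroll depth implied by the 'loops' count further
+    // down), and beta0 marks beta == 0, in which case the accumulators start
+    // from zero instead of loading and pre-scaling the existing C tile.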
+
+    /* For now, very naive with no blocking */
+    for(int y = 0; y < M; y += 4)
+    {
+        for(int x0 = 0; x0 < N; x0 += 16)
+        {
+            const float *a_ptr0 = A + (y * lda);
+            const float *a_ptr1 = a_ptr0 + lda;
+            const float *a_ptr2 = a_ptr1 + lda;
+            const float *a_ptr3 = a_ptr2 + lda;
+
+            const float *b_ptr = B + x0;
+
+            float *c_ptr0 = C + (y * ldc) + x0;
+            float *c_ptr1 = c_ptr0 + ldc;
+            float *c_ptr2 = c_ptr1 + ldc;
+            float *c_ptr3 = c_ptr2 + ldc;
+
+            int loops = ((K + 4) / 8) - 1;
+
+            size_t ldbb = ldb * sizeof(float);
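+
+            // Loop accounting (worked example): each main-loop iteration
+            // consumes 8 values of K and the detached tail consumes 8, or
+            // only 4 when oddk is set.  For example, K=24 gives oddk=0,
+            // loops=2 (16 in the loop plus 8 in the tail), and K=12 gives
+            // oddk=1, loops=1 (8 plus 4).  Note that this accounting assumes
+            // K is a multiple of 4.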
+
+            __asm __volatile(
+                "a0   .req v0\n"
+                "a1   .req v1\n"
+                "a2   .req v2\n"
+                "a3   .req v3\n"
+                "a0a  .req v4\n"
+                "a1a  .req v5\n"
+                "a2a  .req v6\n"
+                "a3a  .req v7\n"
+                "bb0  .req v8\n"
+                "bb1  .req v9\n"
+                "bb2  .req v10\n"
+                "bb3  .req v11\n"
+                "b0a  .req v12\n"
+                "b1a  .req v13\n"
+                "b2a  .req v14\n"
+                "b3a  .req v15\n"
+
+                "a0q  .req q0\n"
+                "a1q  .req q1\n"
+                "a2q  .req q2\n"
+                "a3q  .req q3\n"
+                "a0aq .req q4\n"
+                "a1aq .req q5\n"
+                "a2aq .req q6\n"
+                "a3aq .req q7\n"
+                "b0q  .req q8\n"
+                "b1q  .req q9\n"
+                "b2q  .req q10\n"
+                "b3q  .req q11\n"
+                "b0aq .req q12\n"
+                "b1aq .req q13\n"
+                "b2aq .req q14\n"
+                "b3aq .req q15\n"
+
+                "movi    v16.4s, #0x0\n"
+                "ldr    a0q, [%[a_ptr0]]\n"
+                "movi    v17.4s, #0x0\n"
+                "ldr    b0q, [%[b_ptr]]\n"
+                "movi    v18.4s, #0x0\n"
+                "ldr    b1q, [%[b_ptr], #16]\n"
+                "movi    v19.4s, #0x0\n"
+                "ldr    b2q, [%[b_ptr], #32]\n"
+                "movi    v20.4s, #0x0\n"
+                "ldr    b3q, [%[b_ptr], #48]\n"
+                "movi    v21.4s, #0x0\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "ldr    a1q, [%[a_ptr1]]\n"
+                "movi    v22.4s, #0x0\n"
+                "ldr    a2q, [%[a_ptr2]]\n"
+                "movi    v23.4s, #0x0\n"
+                "ldr    a3q, [%[a_ptr3]]\n"
+                "movi    v24.4s, #0x0\n"
+                "ldr    b0aq, [%[b_ptr]]\n"
+                "movi    v25.4s, #0x0\n"
+                "ldr    b1aq, [%[b_ptr], #16]\n"
+                "movi    v26.4s, #0x0\n"
+                "ldr    b2aq, [%[b_ptr], #32]\n"
+                "cbz    %w[beta0], 5f\n"
+                "movi    v27.4s, #0x0\n"
+                "movi    v28.4s, #0x0\n"
+                "movi    v29.4s, #0x0\n"
+                "movi    v30.4s, #0x0\n"
+                "movi    v31.4s, #0x0\n"
+
+                // Skip if no complete loops.
+                "cbz    %w[loops], 4f\n"
+                "b    1f\n"
+
+                // If beta is non-zero, need to load and multiply by beta
+                "5:\n"
+                "ld1r    {v4.4s}, [%[betaptr]]\n"
+                "ldr    q16, [%[c_ptr0]]\n"
+                "ldr    q17, [%[c_ptr0], #16]\n"
+                "ldr    q18, [%[c_ptr0], #32]\n"
+                "ldr    q19, [%[c_ptr0], #48]\n"
+
+                "ldr    q20, [%[c_ptr1]]\n"
+                "fmul    v16.4s, v16.4s, v4.4s\n"
+                "ldr    q21, [%[c_ptr1], #16]\n"
+                "fmul    v17.4s, v17.4s, v4.4s\n"
+                "ldr    q22, [%[c_ptr1], #32]\n"
+                "fmul    v18.4s, v18.4s, v4.4s\n"
+                "ldr    q23, [%[c_ptr1], #48]\n"
+                "fmul    v19.4s, v19.4s, v4.4s\n"
+
+                "ldr    q24, [%[c_ptr2]]\n"
+                "fmul    v20.4s, v20.4s, v4.4s\n"
+                "ldr    q25, [%[c_ptr2], #16]\n"
+                "fmul    v21.4s, v21.4s, v4.4s\n"
+                "ldr    q26, [%[c_ptr2], #32]\n"
+                "fmul    v22.4s, v22.4s, v4.4s\n"
+                "ldr    q27, [%[c_ptr2], #48]\n"
+                "fmul    v23.4s, v23.4s, v4.4s\n"
+
+                "ldr    q28, [%[c_ptr3]]\n"
+                "fmul    v24.4s, v24.4s, v4.4s\n"
+                "ldr    q29, [%[c_ptr3], #16]\n"
+                "fmul    v25.4s, v25.4s, v4.4s\n"
+                "ldr    q30, [%[c_ptr3], #32]\n"
+                "fmul    v26.4s, v26.4s, v4.4s\n"
+                "ldr    q31, [%[c_ptr3], #48]\n"
+                "fmul    v27.4s, v27.4s, v4.4s\n"
+
+                "fmul    v28.4s, v28.4s, v4.4s\n"
+                "fmul    v29.4s, v29.4s, v4.4s\n"
+                "fmul    v30.4s, v30.4s, v4.4s\n"
+                "fmul    v31.4s, v31.4s, v4.4s\n"
+
+                "cbz    %w[loops], 4f\n"
+
+                "1:\n"
+                // Unroll 0
+                "fmla    v16.4s, bb0.4s, a0.s[0]\n"
+                "fmla    v20.4s, bb0.4s, a1.s[0]\n"
+                "ldr    b3aq, [%[b_ptr], #48]\n"
+                "fmla    v24.4s, bb0.4s, a2.s[0]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v28.4s, bb0.4s, a3.s[0]\n"
+                "ldr    b0q, [%[b_ptr]]\n"
+
+                "fmla    v17.4s, bb1.4s, a0.s[0]\n"
+                "fmla    v21.4s, bb1.4s, a1.s[0]\n"
+                "ldr    a0aq, [%[a_ptr0], #16]\n"
+                "fmla    v25.4s, bb1.4s, a2.s[0]\n"
+                "fmla    v29.4s, bb1.4s, a3.s[0]\n"
+                "ldr    b1q, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, bb2.4s, a0.s[0]\n"
+                "fmla    v22.4s, bb2.4s, a1.s[0]\n"
+                "ldr    a1aq, [%[a_ptr1], #16]\n"
+                "fmla    v26.4s, bb2.4s, a2.s[0]\n"
+                "fmla    v30.4s, bb2.4s, a3.s[0]\n"
+                "ldr    b2q, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, bb3.4s, a0.s[0]\n"
+                "fmla    v23.4s, bb3.4s, a1.s[0]\n"
+                "ldr    a2aq, [%[a_ptr2], #16]\n"
+                "fmla    v27.4s, bb3.4s, a2.s[0]\n"
+                "fmla    v31.4s, bb3.4s, a3.s[0]\n"
+                "ldr    b3q, [%[b_ptr], #48]\n"
+
+                // Unroll 1
+                "fmla    v16.4s, b0a.4s, a0.s[1]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v20.4s, b0a.4s, a1.s[1]\n"
+                "ldr    a3aq, [%[a_ptr3], #16]\n"
+                "fmla    v24.4s, b0a.4s, a2.s[1]\n"
+                "fmla    v28.4s, b0a.4s, a3.s[1]\n"
+                "ldr    b0aq, [%[b_ptr]]\n"
+
+                "fmla    v17.4s, b1a.4s, a0.s[1]\n"
+                "fmla    v21.4s, b1a.4s, a1.s[1]\n"
+                "subs    %w[loops], %w[loops], #1\n"
+                "fmla    v25.4s, b1a.4s, a2.s[1]\n"
+                "fmla    v29.4s, b1a.4s, a3.s[1]\n"
+                "ldr    b1aq, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, b2a.4s, a0.s[1]\n"
+                "fmla    v22.4s, b2a.4s, a1.s[1]\n"
+                "fmla    v26.4s, b2a.4s, a2.s[1]\n"
+                "fmla    v30.4s, b2a.4s, a3.s[1]\n"
+                "ldr    b2aq, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, b3a.4s, a0.s[1]\n"
+                "fmla    v23.4s, b3a.4s, a1.s[1]\n"
+                "fmla    v27.4s, b3a.4s, a2.s[1]\n"
+                "fmla    v31.4s, b3a.4s, a3.s[1]\n"
+                "ldr    b3aq, [%[b_ptr], #48]\n"
+
+                // Unroll 2
+                "fmla    v16.4s, bb0.4s, a0.s[2]\n"
+                "fmla    v20.4s, bb0.4s, a1.s[2]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v24.4s, bb0.4s, a2.s[2]\n"
+                "fmla    v28.4s, bb0.4s, a3.s[2]\n"
+                "ldr    b0q, [%[b_ptr]]\n"
+
+                "fmla    v17.4s, bb1.4s, a0.s[2]\n"
+                "add    %[a_ptr0], %[a_ptr0], #32\n"
+                "fmla    v21.4s, bb1.4s, a1.s[2]\n"
+                "add    %[a_ptr1], %[a_ptr1], #32\n"
+                "fmla    v25.4s, bb1.4s, a2.s[2]\n"
+                "add    %[a_ptr2], %[a_ptr2], #32\n"
+                "fmla    v29.4s, bb1.4s, a3.s[2]\n"
+                "ldr    b1q, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, bb2.4s, a0.s[2]\n"
+                "add    %[a_ptr3], %[a_ptr3], #32\n"
+                "fmla    v22.4s, bb2.4s, a1.s[2]\n"
+                "fmla    v26.4s, bb2.4s, a2.s[2]\n"
+                "fmla    v30.4s, bb2.4s, a3.s[2]\n"
+                "ldr    b2q, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, bb3.4s, a0.s[2]\n"
+                "fmla    v23.4s, bb3.4s, a1.s[2]\n"
+                "fmla    v27.4s, bb3.4s, a2.s[2]\n"
+                "fmla    v31.4s, bb3.4s, a3.s[2]\n"
+                "ldr    b3q, [%[b_ptr], #48]\n"
+
+                // Unroll 3
+                "fmla    v16.4s, b0a.4s, a0.s[3]\n"
+                "fmla    v20.4s, b0a.4s, a1.s[3]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v24.4s, b0a.4s, a2.s[3]\n"
+                "fmla    v28.4s, b0a.4s, a3.s[3]\n"
+                "ldr    b0aq, [%[b_ptr]]\n"
+
+                "fmla    v17.4s, b1a.4s, a0.s[3]\n"
+                "fmla    v21.4s, b1a.4s, a1.s[3]\n"
+                "fmla    v25.4s, b1a.4s, a2.s[3]\n"
+                "fmla    v29.4s, b1a.4s, a3.s[3]\n"
+                "ldr    b1aq, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, b2a.4s, a0.s[3]\n"
+                "fmla    v22.4s, b2a.4s, a1.s[3]\n"
+                "fmla    v26.4s, b2a.4s, a2.s[3]\n"
+                "fmla    v30.4s, b2a.4s, a3.s[3]\n"
+                "ldr    b2aq, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, b3a.4s, a0.s[3]\n"
+                "fmla    v23.4s, b3a.4s, a1.s[3]\n"
+                "ldr    a0q, [%[a_ptr0]]\n"
+                "fmla    v27.4s, b3a.4s, a2.s[3]\n"
+                "fmla    v31.4s, b3a.4s, a3.s[3]\n"
+                "ldr    b3aq, [%[b_ptr], #48]\n"
+
+                // Unroll 4
+                "fmla    v16.4s, bb0.4s, a0a.s[0]\n"
+                "fmla    v20.4s, bb0.4s, a1a.s[0]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v24.4s, bb0.4s, a2a.s[0]\n"
+                "fmla    v28.4s, bb0.4s, a3a.s[0]\n"
+                "ldr    b0q, [%[b_ptr]]\n"
+
+                "fmla    v17.4s, bb1.4s, a0a.s[0]\n"
+                "fmla    v21.4s, bb1.4s, a1a.s[0]\n"
+                "ldr    a1q, [%[a_ptr1]]\n"
+                "fmla    v25.4s, bb1.4s, a2a.s[0]\n"
+                "fmla    v29.4s, bb1.4s, a3a.s[0]\n"
+                "ldr    b1q, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, bb2.4s, a0a.s[0]\n"
+                "fmla    v22.4s, bb2.4s, a1a.s[0]\n"
+                "ldr    a2q, [%[a_ptr2]]\n"
+                "fmla    v26.4s, bb2.4s, a2a.s[0]\n"
+                "fmla    v30.4s, bb2.4s, a3a.s[0]\n"
+                "ldr    b2q, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, bb3.4s, a0a.s[0]\n"
+                "fmla    v23.4s, bb3.4s, a1a.s[0]\n"
+                "ldr    a3q, [%[a_ptr3]]\n"
+                "fmla    v27.4s, bb3.4s, a2a.s[0]\n"
+                "fmla    v31.4s, bb3.4s, a3a.s[0]\n"
+                "ldr    b3q, [%[b_ptr], #48]\n"
+
+                // Unroll 5
+                "fmla    v16.4s, b0a.4s, a0a.s[1]\n"
+                "fmla    v20.4s, b0a.4s, a1a.s[1]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v24.4s, b0a.4s, a2a.s[1]\n"
+                "fmla    v28.4s, b0a.4s, a3a.s[1]\n"
+                "ldr    b0aq, [%[b_ptr]]\n"
+
+                "fmla    v17.4s, b1a.4s, a0a.s[1]\n"
+                "fmla    v21.4s, b1a.4s, a1a.s[1]\n"
+                "fmla    v25.4s, b1a.4s, a2a.s[1]\n"
+                "fmla    v29.4s, b1a.4s, a3a.s[1]\n"
+                "ldr    b1aq, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, b2a.4s, a0a.s[1]\n"
+                "fmla    v22.4s, b2a.4s, a1a.s[1]\n"
+                "fmla    v26.4s, b2a.4s, a2a.s[1]\n"
+                "fmla    v30.4s, b2a.4s, a3a.s[1]\n"
+                "ldr    b2aq, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, b3a.4s, a0a.s[1]\n"
+                "fmla    v23.4s, b3a.4s, a1a.s[1]\n"
+                "fmla    v27.4s, b3a.4s, a2a.s[1]\n"
+                "fmla    v31.4s, b3a.4s, a3a.s[1]\n"
+                "ldr    b3aq, [%[b_ptr], #48]\n"
+
+                // Unroll 6
+                "fmla    v16.4s, bb0.4s, a0a.s[2]\n"
+                "fmla    v20.4s, bb0.4s, a1a.s[2]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v24.4s, bb0.4s, a2a.s[2]\n"
+                "fmla    v28.4s, bb0.4s, a3a.s[2]\n"
+                "ldr    b0q, [%[b_ptr]]\n"
+
+                "fmla    v17.4s, bb1.4s, a0a.s[2]\n"
+                "fmla    v21.4s, bb1.4s, a1a.s[2]\n"
+                "fmla    v25.4s, bb1.4s, a2a.s[2]\n"
+                "fmla    v29.4s, bb1.4s, a3a.s[2]\n"
+                "ldr    b1q, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, bb2.4s, a0a.s[2]\n"
+                "fmla    v22.4s, bb2.4s, a1a.s[2]\n"
+                "fmla    v26.4s, bb2.4s, a2a.s[2]\n"
+                "fmla    v30.4s, bb2.4s, a3a.s[2]\n"
+                "ldr    b2q, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, bb3.4s, a0a.s[2]\n"
+                "fmla    v23.4s, bb3.4s, a1a.s[2]\n"
+                "fmla    v27.4s, bb3.4s, a2a.s[2]\n"
+                "fmla    v31.4s, bb3.4s, a3a.s[2]\n"
+                "ldr    b3q, [%[b_ptr], #48]\n"
+
+                // Unroll 7
+                "fmla    v16.4s, b0a.4s, a0a.s[3]\n"
+                "fmla    v20.4s, b0a.4s, a1a.s[3]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v24.4s, b0a.4s, a2a.s[3]\n"
+                "fmla    v28.4s, b0a.4s, a3a.s[3]\n"
+                "ldr    b0aq, [%[b_ptr]]\n"
+
+                "fmla    v17.4s, b1a.4s, a0a.s[3]\n"
+                "fmla    v21.4s, b1a.4s, a1a.s[3]\n"
+                "fmla    v25.4s, b1a.4s, a2a.s[3]\n"
+                "fmla    v29.4s, b1a.4s, a3a.s[3]\n"
+                "ldr    b1aq, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, b2a.4s, a0a.s[3]\n"
+                "fmla    v22.4s, b2a.4s, a1a.s[3]\n"
+                "fmla    v26.4s, b2a.4s, a2a.s[3]\n"
+                "fmla    v30.4s, b2a.4s, a3a.s[3]\n"
+                "ldr    b2aq, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, b3a.4s, a0a.s[3]\n"
+                "fmla    v23.4s, b3a.4s, a1a.s[3]\n"
+                "fmla    v27.4s, b3a.4s, a2a.s[3]\n"
+                "fmla    v31.4s, b3a.4s, a3a.s[3]\n"
+                "bne    1b\n"
+
+                // Skip to here
+                "4:\n"
+
+                // Detached final iteration
+                // Unroll 0
+                "fmla    v16.4s, bb0.4s, a0.s[0]\n"
+                "fmla    v20.4s, bb0.4s, a1.s[0]\n"
+                "ldr    b3aq, [%[b_ptr], #48]\n"
+                "fmla    v24.4s, bb0.4s, a2.s[0]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v28.4s, bb0.4s, a3.s[0]\n"
+                "ldr    b0q, [%[b_ptr]]\n"
+
+                "fmla    v17.4s, bb1.4s, a0.s[0]\n"
+                "cbnz    %w[oddk], 2f\n" // Deal with odd K before we load a0a
+                "fmla    v21.4s, bb1.4s, a1.s[0]\n"
+                "ldr    a0aq, [%[a_ptr0], #16]\n"
+                "fmla    v25.4s, bb1.4s, a2.s[0]\n"
+                "fmla    v29.4s, bb1.4s, a3.s[0]\n"
+                "ldr    b1q, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, bb2.4s, a0.s[0]\n"
+                "fmla    v22.4s, bb2.4s, a1.s[0]\n"
+                "ldr    a1aq, [%[a_ptr1], #16]\n"
+                "fmla    v26.4s, bb2.4s, a2.s[0]\n"
+                "fmla    v30.4s, bb2.4s, a3.s[0]\n"
+                "ldr    b2q, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, bb3.4s, a0.s[0]\n"
+                "fmla    v23.4s, bb3.4s, a1.s[0]\n"
+                "ldr    a2aq, [%[a_ptr2], #16]\n"
+                "fmla    v27.4s, bb3.4s, a2.s[0]\n"
+                "fmla    v31.4s, bb3.4s, a3.s[0]\n"
+                "ldr    b3q, [%[b_ptr], #48]\n"
+
+                // Unroll 1
+                "fmla    v16.4s, b0a.4s, a0.s[1]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v20.4s, b0a.4s, a1.s[1]\n"
+                "ldr    a3aq, [%[a_ptr3], #16]\n"
+                "fmla    v24.4s, b0a.4s, a2.s[1]\n"
+                "fmla    v28.4s, b0a.4s, a3.s[1]\n"
+                "ldr    b0aq, [%[b_ptr]]\n"
+
+                "fmla    v17.4s, b1a.4s, a0.s[1]\n"
+                "fmla    v21.4s, b1a.4s, a1.s[1]\n"
+                "subs    %w[loops], %w[loops], #1\n"
+                "fmla    v25.4s, b1a.4s, a2.s[1]\n"
+                "fmla    v29.4s, b1a.4s, a3.s[1]\n"
+                "ldr    b1aq, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, b2a.4s, a0.s[1]\n"
+                "fmla    v22.4s, b2a.4s, a1.s[1]\n"
+                "fmla    v26.4s, b2a.4s, a2.s[1]\n"
+                "fmla    v30.4s, b2a.4s, a3.s[1]\n"
+                "ldr    b2aq, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, b3a.4s, a0.s[1]\n"
+                "fmla    v23.4s, b3a.4s, a1.s[1]\n"
+                "fmla    v27.4s, b3a.4s, a2.s[1]\n"
+                "fmla    v31.4s, b3a.4s, a3.s[1]\n"
+                "ldr    b3aq, [%[b_ptr], #48]\n"
+
+                // Unroll 2
+                "fmla    v16.4s, bb0.4s, a0.s[2]\n"
+                "fmla    v20.4s, bb0.4s, a1.s[2]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v24.4s, bb0.4s, a2.s[2]\n"
+                "fmla    v28.4s, bb0.4s, a3.s[2]\n"
+                "ldr    b0q, [%[b_ptr]]\n"
+
+                "fmla    v17.4s, bb1.4s, a0.s[2]\n"
+                "fmla    v21.4s, bb1.4s, a1.s[2]\n"
+                "fmla    v25.4s, bb1.4s, a2.s[2]\n"
+                "fmla    v29.4s, bb1.4s, a3.s[2]\n"
+                "ldr    b1q, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, bb2.4s, a0.s[2]\n"
+                "fmla    v22.4s, bb2.4s, a1.s[2]\n"
+                "fmla    v26.4s, bb2.4s, a2.s[2]\n"
+                "fmla    v30.4s, bb2.4s, a3.s[2]\n"
+                "ldr    b2q, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, bb3.4s, a0.s[2]\n"
+                "fmla    v23.4s, bb3.4s, a1.s[2]\n"
+                "fmla    v27.4s, bb3.4s, a2.s[2]\n"
+                "fmla    v31.4s, bb3.4s, a3.s[2]\n"
+                "ldr    b3q, [%[b_ptr], #48]\n"
+
+                // Unroll 3
+                "fmla    v16.4s, b0a.4s, a0.s[3]\n"
+                "fmla    v20.4s, b0a.4s, a1.s[3]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v24.4s, b0a.4s, a2.s[3]\n"
+                "fmla    v28.4s, b0a.4s, a3.s[3]\n"
+                "ldr    b0aq, [%[b_ptr]]\n"
+
+                "fmla    v17.4s, b1a.4s, a0.s[3]\n"
+                "fmla    v21.4s, b1a.4s, a1.s[3]\n"
+                "ldr    a3aq, [%[a_ptr3], #16]\n"
+                "fmla    v25.4s, b1a.4s, a2.s[3]\n"
+                "fmla    v29.4s, b1a.4s, a3.s[3]\n"
+                "ldr    b1aq, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, b2a.4s, a0.s[3]\n"
+                "fmla    v22.4s, b2a.4s, a1.s[3]\n"
+                "fmla    v26.4s, b2a.4s, a2.s[3]\n"
+                "fmla    v30.4s, b2a.4s, a3.s[3]\n"
+                "ldr    b2aq, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, b3a.4s, a0.s[3]\n"
+                "fmla    v23.4s, b3a.4s, a1.s[3]\n"
+                "fmla    v27.4s, b3a.4s, a2.s[3]\n"
+                "fmla    v31.4s, b3a.4s, a3.s[3]\n"
+                "ldr    b3aq, [%[b_ptr], #48]\n"
+
+                // Unroll 4
+                "fmla    v16.4s, bb0.4s, a0a.s[0]\n"
+                "fmla    v20.4s, bb0.4s, a1a.s[0]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v24.4s, bb0.4s, a2a.s[0]\n"
+                "fmla    v28.4s, bb0.4s, a3a.s[0]\n"
+                "ldr    b0q, [%[b_ptr]]\n"
+
+                "fmla    v17.4s, bb1.4s, a0a.s[0]\n"
+                "fmla    v21.4s, bb1.4s, a1a.s[0]\n"
+                "fmla    v25.4s, bb1.4s, a2a.s[0]\n"
+                "fmla    v29.4s, bb1.4s, a3a.s[0]\n"
+                "ldr    b1q, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, bb2.4s, a0a.s[0]\n"
+                "fmla    v22.4s, bb2.4s, a1a.s[0]\n"
+                "fmla    v26.4s, bb2.4s, a2a.s[0]\n"
+                "fmla    v30.4s, bb2.4s, a3a.s[0]\n"
+                "ldr    b2q, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, bb3.4s, a0a.s[0]\n"
+                "fmla    v23.4s, bb3.4s, a1a.s[0]\n"
+                "fmla    v27.4s, bb3.4s, a2a.s[0]\n"
+                "fmla    v31.4s, bb3.4s, a3a.s[0]\n"
+                "ldr    b3q, [%[b_ptr], #48]\n"
+
+                // Unroll 5
+                "fmla    v16.4s, b0a.4s, a0a.s[1]\n"
+                "fmla    v20.4s, b0a.4s, a1a.s[1]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v24.4s, b0a.4s, a2a.s[1]\n"
+                "fmla    v28.4s, b0a.4s, a3a.s[1]\n"
+                "ldr    b0aq, [%[b_ptr]]\n"
+
+                "fmla    v17.4s, b1a.4s, a0a.s[1]\n"
+                "fmla    v21.4s, b1a.4s, a1a.s[1]\n"
+                "fmla    v25.4s, b1a.4s, a2a.s[1]\n"
+                "fmla    v29.4s, b1a.4s, a3a.s[1]\n"
+                "ldr    b1aq, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, b2a.4s, a0a.s[1]\n"
+                "fmla    v22.4s, b2a.4s, a1a.s[1]\n"
+                "fmla    v26.4s, b2a.4s, a2a.s[1]\n"
+                "fmla    v30.4s, b2a.4s, a3a.s[1]\n"
+                "ldr    b2aq, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, b3a.4s, a0a.s[1]\n"
+                "fmla    v23.4s, b3a.4s, a1a.s[1]\n"
+                "fmla    v27.4s, b3a.4s, a2a.s[1]\n"
+                "fmla    v31.4s, b3a.4s, a3a.s[1]\n"
+                "ldr    b3aq, [%[b_ptr], #48]\n"
+
+                // Unroll 6
+                "fmla    v16.4s, bb0.4s, a0a.s[2]\n"
+                "fmla    v20.4s, bb0.4s, a1a.s[2]\n"
+                "fmla    v24.4s, bb0.4s, a2a.s[2]\n"
+                "fmla    v28.4s, bb0.4s, a3a.s[2]\n"
+
+                "fmla    v17.4s, bb1.4s, a0a.s[2]\n"
+                "fmla    v21.4s, bb1.4s, a1a.s[2]\n"
+                "fmla    v25.4s, bb1.4s, a2a.s[2]\n"
+                "fmla    v29.4s, bb1.4s, a3a.s[2]\n"
+
+                "fmla    v18.4s, bb2.4s, a0a.s[2]\n"
+                "fmla    v22.4s, bb2.4s, a1a.s[2]\n"
+                "fmla    v26.4s, bb2.4s, a2a.s[2]\n"
+                "fmla    v30.4s, bb2.4s, a3a.s[2]\n"
+
+                "fmla    v19.4s, bb3.4s, a0a.s[2]\n"
+                "fmla    v23.4s, bb3.4s, a1a.s[2]\n"
+                "fmla    v27.4s, bb3.4s, a2a.s[2]\n"
+                "fmla    v31.4s, bb3.4s, a3a.s[2]\n"
+
+                // Unroll 7
+                "fmla    v16.4s, b0a.4s, a0a.s[3]\n"
+                "fmla    v17.4s, b1a.4s, a0a.s[3]\n"
+                "fmla    v18.4s, b2a.4s, a0a.s[3]\n"
+                "fmla    v19.4s, b3a.4s, a0a.s[3]\n"
+
+                "fmla    v20.4s, b0a.4s, a1a.s[3]\n"
+                "str    q16, [%[c_ptr0]]\n"
+                "fmla    v21.4s, b1a.4s, a1a.s[3]\n"
+                "str    q17, [%[c_ptr0], #16]\n"
+                "fmla    v22.4s, b2a.4s, a1a.s[3]\n"
+                "str    q18, [%[c_ptr0], #32]\n"
+                "fmla    v23.4s, b3a.4s, a1a.s[3]\n"
+                "str    q19, [%[c_ptr0], #48]\n"
+
+                "fmla    v24.4s, b0a.4s, a2a.s[3]\n"
+                "str    q20, [%[c_ptr1]]\n"
+                "fmla    v25.4s, b1a.4s, a2a.s[3]\n"
+                "str    q21, [%[c_ptr1], #16]\n"
+                "fmla    v26.4s, b2a.4s, a2a.s[3]\n"
+                "str    q22, [%[c_ptr1], #32]\n"
+                "fmla    v27.4s, b3a.4s, a2a.s[3]\n"
+                "str    q23, [%[c_ptr1], #48]\n"
+
+                "fmla    v28.4s, b0a.4s, a3a.s[3]\n"
+                "str    q24, [%[c_ptr2]]\n"
+                "fmla    v29.4s, b1a.4s, a3a.s[3]\n"
+                "str    q25, [%[c_ptr2], #16]\n"
+                "fmla    v30.4s, b2a.4s, a3a.s[3]\n"
+                "str    q26, [%[c_ptr2], #32]\n"
+                "fmla    v31.4s, b3a.4s, a3a.s[3]\n"
+                "str    q27, [%[c_ptr2], #48]\n"
+                "b    3f\n"
+
+                // Odd K case: Just do 4 more.
+                "2:\n"
+                "fmla    v21.4s, bb1.4s, a1.s[0]\n"
+                "fmla    v25.4s, bb1.4s, a2.s[0]\n"
+                "fmla    v29.4s, bb1.4s, a3.s[0]\n"
+                "ldr    b1q, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, bb2.4s, a0.s[0]\n"
+                "fmla    v22.4s, bb2.4s, a1.s[0]\n"
+                "fmla    v26.4s, bb2.4s, a2.s[0]\n"
+                "fmla    v30.4s, bb2.4s, a3.s[0]\n"
+                "ldr    b2q, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, bb3.4s, a0.s[0]\n"
+                "fmla    v23.4s, bb3.4s, a1.s[0]\n"
+                "fmla    v27.4s, bb3.4s, a2.s[0]\n"
+                "fmla    v31.4s, bb3.4s, a3.s[0]\n"
+                "ldr    b3q, [%[b_ptr], #48]\n"
+
+                // Unroll 1
+                "fmla    v16.4s, b0a.4s, a0.s[1]\n"
+                "add    %[b_ptr], %[b_ptr], %[ldb]\n"
+                "fmla    v20.4s, b0a.4s, a1.s[1]\n"
+                "fmla    v24.4s, b0a.4s, a2.s[1]\n"
+                "fmla    v28.4s, b0a.4s, a3.s[1]\n"
+                "ldr    b0aq, [%[b_ptr]]\n"
+
+                "fmla    v17.4s, b1a.4s, a0.s[1]\n"
+                "fmla    v21.4s, b1a.4s, a1.s[1]\n"
+                "subs    %w[loops], %w[loops], #1\n"
+                "fmla    v25.4s, b1a.4s, a2.s[1]\n"
+                "fmla    v29.4s, b1a.4s, a3.s[1]\n"
+                "ldr    b1aq, [%[b_ptr], #16]\n"
+
+                "fmla    v18.4s, b2a.4s, a0.s[1]\n"
+                "fmla    v22.4s, b2a.4s, a1.s[1]\n"
+                "fmla    v26.4s, b2a.4s, a2.s[1]\n"
+                "fmla    v30.4s, b2a.4s, a3.s[1]\n"
+                "ldr    b2aq, [%[b_ptr], #32]\n"
+
+                "fmla    v19.4s, b3a.4s, a0.s[1]\n"
+                "fmla    v23.4s, b3a.4s, a1.s[1]\n"
+                "fmla    v27.4s, b3a.4s, a2.s[1]\n"
+                "fmla    v31.4s, b3a.4s, a3.s[1]\n"
+                "ldr    b3aq, [%[b_ptr], #48]\n"
+
+                // Unroll 2
+                "fmla    v16.4s, bb0.4s, a0.s[2]\n"
+                "fmla    v20.4s, bb0.4s, a1.s[2]\n"
+                "fmla    v24.4s, bb0.4s, a2.s[2]\n"
+                "fmla    v28.4s, bb0.4s, a3.s[2]\n"
+
+                "fmla    v17.4s, bb1.4s, a0.s[2]\n"
+                "fmla    v21.4s, bb1.4s, a1.s[2]\n"
+                "fmla    v25.4s, bb1.4s, a2.s[2]\n"
+                "fmla    v29.4s, bb1.4s, a3.s[2]\n"
+
+                "fmla    v18.4s, bb2.4s, a0.s[2]\n"
+                "fmla    v22.4s, bb2.4s, a1.s[2]\n"
+                "fmla    v26.4s, bb2.4s, a2.s[2]\n"
+                "fmla    v30.4s, bb2.4s, a3.s[2]\n"
+
+                "fmla    v19.4s, bb3.4s, a0.s[2]\n"
+                "fmla    v23.4s, bb3.4s, a1.s[2]\n"
+                "fmla    v27.4s, bb3.4s, a2.s[2]\n"
+                "fmla    v31.4s, bb3.4s, a3.s[2]\n"
+
+                // Unroll 3
+                "fmla    v16.4s, b0a.4s, a0.s[3]\n"
+                "fmla    v17.4s, b1a.4s, a0.s[3]\n"
+                "fmla    v18.4s, b2a.4s, a0.s[3]\n"
+                "fmla    v19.4s, b3a.4s, a0.s[3]\n"
+
+                "fmla    v20.4s, b0a.4s, a1.s[3]\n"
+                "str    q16, [%[c_ptr0]]\n"
+                "fmla    v21.4s, b1a.4s, a1.s[3]\n"
+                "str    q17, [%[c_ptr0], #16]\n"
+                "fmla    v22.4s, b2a.4s, a1.s[3]\n"
+                "str    q18, [%[c_ptr0], #32]\n"
+                "fmla    v23.4s, b3a.4s, a1.s[3]\n"
+                "str    q19, [%[c_ptr0], #48]\n"
+
+                "fmla    v24.4s, b0a.4s, a2.s[3]\n"
+                "str    q20, [%[c_ptr1]]\n"
+                "fmla    v25.4s, b1a.4s, a2.s[3]\n"
+                "str    q21, [%[c_ptr1], #16]\n"
+                "fmla    v26.4s, b2a.4s, a2.s[3]\n"
+                "str    q22, [%[c_ptr1], #32]\n"
+                "fmla    v27.4s, b3a.4s, a2.s[3]\n"
+                "str    q23, [%[c_ptr1], #48]\n"
+
+                "fmla    v28.4s, b0a.4s, a3.s[3]\n"
+                "str    q24, [%[c_ptr2]]\n"
+                "fmla    v29.4s, b1a.4s, a3.s[3]\n"
+                "str    q25, [%[c_ptr2], #16]\n"
+                "fmla    v30.4s, b2a.4s, a3.s[3]\n"
+                "str    q26, [%[c_ptr2], #32]\n"
+                "fmla    v31.4s, b3a.4s, a3.s[3]\n"
+                "str    q27, [%[c_ptr2], #48]\n"
+
+                "3:\n"
+                "str    q28, [%[c_ptr3]]\n"
+                "str    q29, [%[c_ptr3], #16]\n"
+                "str    q30, [%[c_ptr3], #32]\n"
+                "str    q31, [%[c_ptr3], #48]\n"
+
+                : [a_ptr0] "+r"(a_ptr0), [a_ptr1] "+r"(a_ptr1), [a_ptr2] "+r"(a_ptr2), [a_ptr3] "+r"(a_ptr3),
+                [b_ptr] "+r"(b_ptr), [loops] "+r"(loops)
+                : [ldb] "r"(ldbb), [oddk] "r"(oddk), [beta0] "r"(beta0), [betaptr] "r"(&beta),
+                [c_ptr0] "r"(c_ptr0), [c_ptr1] "r"(c_ptr1), [c_ptr2] "r"(c_ptr2), [c_ptr3] "r"(c_ptr3)
+                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+                "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31",
+                "cc", "memory");
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
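
For readability (commentary, not part of the patch), the assembly kernel above computes the same result as the scalar reference below: C = beta*C + A*B over row-major operands, where beta == 0 means C is not read at all.  Handling of M, N and K values that are not multiples of the block sizes is assumed to live in the caller.

    // Scalar reference with the same argument convention as a64_sgemm_native_16x4.
    void sgemm_native_16x4_reference(const float *A, int lda, const float *B, int ldb,
                                     float *C, int ldc, float beta, int M, int N, int K)
    {
        for(int y = 0; y < M; y++)
        {
            for(int x = 0; x < N; x++)
            {
                // beta == 0 means C may be uninitialized and must not be read.
                float acc = (beta == 0.0f) ? 0.0f : beta * C[y * ldc + x];

                for(int k = 0; k < K; k++)
                {
                    acc += A[y * lda + k] * B[k * ldb + x];
                }

                C[y * ldc + x] = acc;
            }
        }
    }
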
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp
new file mode 100644
index 0000000..c89514f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_sgemv_pretransposed(const float *, int, const float *, float *, float, int, int);
+
+// Pretransposed SGEMV strategy class.
+class sgemv_pretransposed
+{
+public:
+    typedef float operand_type;
+    typedef float result_type;
+
+    typedef void (*kern_type)(const float *, int, const float *, float *, float, int, int);
+
+    /* Describes the data layout for matrix (A) input */
+
+    /* Note that often GEMV is expressed as a GEMM with M=1, i.e.  A is the
+     * (row) vector and B is the matrix, but the standard GEMV arrangement
+     * is matrix A times (column) vector X.  "A_transpose" is expressed in
+     * terms of this standard arrangement, so if the A matrix is in fact the
+     * B matrix from a GEMM call, the sense of the transpose needs to be
+     * reversed.  */
+    static const int  A_interleave = 32;
+    static const int  A_block      = 1;
+    static const bool A_transpose  = false;
+
+    /* Kernel blocking parameters */
+    static const int out_width = 32;
+    static const int k_unroll  = 1;
+
+    kern_type kernel = a64_sgemv_pretransposed;
+
+    sgemv_pretransposed(const CPUInfo *ci)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
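
A_interleave = 32 with A_block = 1 and A_transpose = false means the kernel below expects its matrix pre-packed into panels of 32 output columns, each panel holding one contiguous run of 32 floats per step of the accumulation dimension, with lda giving the panel stride in floats.  A hypothetical packing routine, shown only to illustrate that layout (the library's real transform code lives elsewhere), assuming a densely packed row-major source with M rows (the accumulation dimension) and N columns (the output dimension):

    // Illustrative packing sketch: out must hold roundup(N, 32) * M floats and
    // the kernel would then be called with lda == 32 * M.
    static void pack_pretransposed_a(float *out, const float *in, int M, int N)
    {
        const int interleave = 32;

        for(int x0 = 0; x0 < N; x0 += interleave)
        {
            float *panel = out + (x0 / interleave) * interleave * M;

            for(int row = 0; row < M; row++)
            {
                for(int col = 0; col < interleave; col++)
                {
                    const int src_col = x0 + col;

                    // Zero-pad the ragged final panel so the kernel can always
                    // load 32 floats per row.
                    panel[row * interleave + col] = (src_col < N) ? in[row * N + src_col] : 0.0f;
                }
            }
        }
    }
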
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp
new file mode 100644
index 0000000..2907598
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp
@@ -0,0 +1,794 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <algorithm>
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm
+{
+void a64_sgemv_pretransposed(const float *A, int lda, const float *X, float *Y, float beta, int M, int N)
+{
+    const bool beta0 = (beta == 0.0f);
+    const bool beta1 = (beta == 1.0f);
+
+    for(int x = 0; x < N; x += 32)
+    {
+        float *y_ptr = Y + x;
+
+        // How many elements are we processing in this loop?
+        int l = std::min(N - x, 32);
+
+        register float32x4_t r0 asm("v24");
+        register float32x4_t r1 asm("v25");
+        register float32x4_t r2 asm("v26");
+        register float32x4_t r3 asm("v27");
+        register float32x4_t r4 asm("v28");
+        register float32x4_t r5 asm("v29");
+        register float32x4_t r6 asm("v30");
+        register float32x4_t r7 asm("v31");
+
+        register float32x4_t x0 asm("v0");
+        register float32x4_t x0a asm("v1");
+
+        const float *x_ptr = X;
+        const float *a_ptr = A + ((x / 32) * lda);
+
+        if(beta0)
+        {
+            r0 = r1 = r2 = r3 = r4 = r5 = r6 = r7 = vdupq_n_f32(0.0f);
+        }
+        else
+        {
+            if(l == 32)
+            {
+                // Fastest path - load all 8 vectors
+                r0 = vld1q_f32(y_ptr);
+                r1 = vld1q_f32(y_ptr + 4);
+                r2 = vld1q_f32(y_ptr + 8);
+                r3 = vld1q_f32(y_ptr + 12);
+                r4 = vld1q_f32(y_ptr + 16);
+                r5 = vld1q_f32(y_ptr + 20);
+                r6 = vld1q_f32(y_ptr + 24);
+                r7 = vld1q_f32(y_ptr + 28);
+            }
+            else
+            {
+                // Slow case - leftovers.  Note that we don't care about
+                // out-of-range vectors and lanes as we will throw them away at
+                // the end.
+                int vecs    = l / 4; // How many leftover vectors?
+                int oddbits = l % 4; // And how many odd single values?
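+
+                // Worked example: l == 27 gives vecs == 6 and oddbits == 3, so
+                // r0..r5 get full vector loads, r6 gets the three odd values
+                // (with a zero in the unused lane) and r7 is left untouched;
+                // its lanes are discarded at write-back anyway.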
+
+                if(oddbits)
+                {
+                    // Load the outstanding odd values into a vector first
+                    float32x4_t oddvec  = vdupq_n_f32(0.0f); // The initializer is not strictly needed, but it keeps the compiler from warning about a maybe-uninitialized value.
+                    float      *oddbase = y_ptr + l - oddbits;
+
+                    switch(oddbits)
+                    {
+                        case 3:
+                            oddvec = vld1q_lane_f32(oddbase + 2, oddvec, 2);
+                        // fall through
+                        case 2:
+                            oddvec = vld1q_lane_f32(oddbase + 1, oddvec, 1);
+                        // fall through
+                        case 1:
+                            oddvec = vld1q_lane_f32(oddbase, oddvec, 0);
+                            break;
+
+                        default:
+                            UNREACHABLE("Impossible case in switch.");
+                    }
+
+                    // Now load the whole vectors, putting the oddments in when we run out.
+                    do
+                    {
+                        if(vecs == 0)
+                        {
+                            r0 = oddvec;
+                            break;
+                        }
+
+                        r0 = vld1q_f32(y_ptr);
+                        if(--vecs == 0)
+                        {
+                            r1 = oddvec;
+                            break;
+                        }
+
+                        r1 = vld1q_f32(y_ptr + 4);
+                        if(--vecs == 0)
+                        {
+                            r2 = oddvec;
+                            break;
+                        }
+
+                        r2 = vld1q_f32(y_ptr + 8);
+                        if(--vecs == 0)
+                        {
+                            r3 = oddvec;
+                            break;
+                        }
+
+                        r3 = vld1q_f32(y_ptr + 12);
+                        if(--vecs == 0)
+                        {
+                            r4 = oddvec;
+                            break;
+                        }
+
+                        r4 = vld1q_f32(y_ptr + 16);
+                        if(--vecs == 0)
+                        {
+                            r5 = oddvec;
+                            break;
+                        }
+
+                        r5 = vld1q_f32(y_ptr + 20);
+                        if(--vecs == 0)
+                        {
+                            r6 = oddvec;
+                            break;
+                        }
+
+                        r6 = vld1q_f32(y_ptr + 24);
+                        r7 = oddvec;
+                    }
+                    while(0);
+                }
+                else
+                {
+                    // Slightly less slow path - just load the whole vectors
+                    do
+                    {
+                        // It can't be the case that oddbits==0 AND vecs==0 or we wouldn't be here.
+                        if(vecs == 0)
+                        {
+                            UNREACHABLE("Impossible lack of work to do");
+                        }
+
+                        r0 = vld1q_f32(y_ptr);
+                        if(--vecs == 0)
+                        {
+                            break;
+                        }
+
+                        r1 = vld1q_f32(y_ptr + 4);
+                        if(--vecs == 0)
+                        {
+                            break;
+                        }
+
+                        r2 = vld1q_f32(y_ptr + 8);
+                        if(--vecs == 0)
+                        {
+                            break;
+                        }
+
+                        r3 = vld1q_f32(y_ptr + 12);
+                        if(--vecs == 0)
+                        {
+                            break;
+                        }
+
+                        r4 = vld1q_f32(y_ptr + 16);
+                        if(--vecs == 0)
+                        {
+                            break;
+                        }
+
+                        r5 = vld1q_f32(y_ptr + 20);
+                        if(--vecs == 0)
+                        {
+                            break;
+                        }
+
+                        r6 = vld1q_f32(y_ptr + 24);
+                    }
+                    while(0);
+                }
+            }
+
+            if(!beta1)
+            {
+                const float32x4_t vb = vdupq_n_f32(beta);
+
+                r0 = vmulq_f32(r0, vb);
+                r1 = vmulq_f32(r1, vb);
+                r2 = vmulq_f32(r2, vb);
+                r3 = vmulq_f32(r3, vb);
+                r4 = vmulq_f32(r4, vb);
+                r5 = vmulq_f32(r5, vb);
+                r6 = vmulq_f32(r6, vb);
+                r7 = vmulq_f32(r7, vb);
+            }
+        }
+
+        if(M >= 8)
+        {
+            int k = (M / 8) - 1;
+            x0    = vld1q_f32(x_ptr);
+
+            __asm __volatile(
+                "ldr    q2, [%[a_ptr], #0]\n"
+                "ldr    q3, [%[a_ptr], #16]\n"
+                "ldr    q4, [%[a_ptr], #32]\n"
+                "ldr    q5, [%[a_ptr], #48]\n"
+                "ldr    q6, [%[a_ptr], #64]\n"
+                "ldr    q7, [%[a_ptr], #80]\n"
+                "ldr    q8, [%[a_ptr], #96]\n"
+                "ldr    q9, [%[a_ptr], #112]\n"
+                "ldr    q10, [%[a_ptr], #128]\n"
+                "ldr    q11, [%[a_ptr], #144]\n"
+                "ldr    q12, [%[a_ptr], #160]\n"
+                "ldr    q13, [%[a_ptr], #176]\n"
+                "ldr    q14, [%[a_ptr], #192]\n"
+                "ldr    q15, [%[a_ptr], #208]\n"
+                "ldr    q16, [%[a_ptr], #224]\n"
+                "ldr    q17, [%[a_ptr], #240]\n"
+                "ldr    q18, [%[a_ptr], #256]\n"
+                "ldr    q19, [%[a_ptr], #272]\n"
+                "ldr    q20, [%[a_ptr], #288]\n"
+                "ldr    q21, [%[a_ptr], #304]\n"
+                "ldr    q22, [%[a_ptr], #320]\n"
+                "ldr    q23, [%[a_ptr], #336]\n" ASM_PREFETCH("[%[a_ptr], #384]")
+                ASM_PREFETCH("[%[a_ptr], #448]")
+                ASM_PREFETCH("[%[a_ptr], #512]")
+                ASM_PREFETCH("[%[a_ptr], #576]")
+                ASM_PREFETCH("[%[a_ptr], #640]")
+                ASM_PREFETCH("[%[a_ptr], #704]")
+                ASM_PREFETCH("[%[a_ptr], #768]")
+                ASM_PREFETCH("[%[a_ptr], #832]")
+                ASM_PREFETCH("[%[a_ptr], #896]")
+                ASM_PREFETCH("[%[a_ptr], #960]")
+                ASM_PREFETCH("[%[a_ptr], #1024]")
+                ASM_PREFETCH("[%[a_ptr], #1088]")
+                ASM_PREFETCH("[%[a_ptr], #1152]")
+                ASM_PREFETCH("[%[a_ptr], #1216]")
+                ASM_PREFETCH("[%[a_ptr], #1280]")
+                ASM_PREFETCH("[%[a_ptr], #1344]")
+                ASM_PREFETCH("[%[a_ptr], #1408]")
+                ASM_PREFETCH("[%[a_ptr], #1472]")
+                ASM_PREFETCH("[%[a_ptr], #1536]")
+                ASM_PREFETCH("[%[a_ptr], #1600]")
+                ASM_PREFETCH("[%[a_ptr], #1664]")
+                ASM_PREFETCH("[%[a_ptr], #1728]")
+                ASM_PREFETCH("[%[a_ptr], #1792]")
+                ASM_PREFETCH("[%[a_ptr], #1856]")
+                ASM_PREFETCH("[%[a_ptr], #1920]")
+                ASM_PREFETCH("[%[a_ptr], #1984]")
+                "add    %[a_ptr], %[a_ptr], #352\n"
+
+                "cbz    %w[k], 2f\n"
+
+                "1:\n"
+                // Unroll 0
+                "fmla    %[r0].4s, v2.4s, %[x0].s[0]\n"
+                "ldr    %q[x0a], [%[x_ptr], #16]\n"
+                "fmla    %[r1].4s, v3.4s, %[x0].s[0]\n"
+                "ldr    q3, [%[a_ptr], #0]\n"
+                "subs    %w[k], %w[k], #1\n"
+                "fmla    %[r2].4s, v4.4s, %[x0].s[0]\n"
+                "ldr    q4, [%[a_ptr], #16]\n"
+                "fmla    %[r3].4s, v5.4s, %[x0].s[0]\n"
+                "ldr    q5, [%[a_ptr], #32]\n"
+                "add    %[x_ptr], %[x_ptr], #32\n" ASM_PREFETCH("[%[a_ptr], #1664]")
+                "fmla    %[r4].4s, v6.4s, %[x0].s[0]\n"
+                "ldr    q6, [%[a_ptr], #48]\n"
+                "fmla    %[r5].4s, v7.4s, %[x0].s[0]\n"
+                "ldr    q7, [%[a_ptr], #64]\n"
+                "fmla    %[r6].4s, v8.4s, %[x0].s[0]\n"
+                "ldr    q8, [%[a_ptr], #80]\n"
+                "fmla    %[r7].4s, v9.4s, %[x0].s[0]\n"
+                "ldr    q9, [%[a_ptr], #96]\n" ASM_PREFETCH("[%[a_ptr], #1728]")
+
+                // Unroll 1
+                "fmla    %[r0].4s, v10.4s, %[x0].s[1]\n"
+                "ldr    q10, [%[a_ptr], #112]\n"
+                "fmla    %[r1].4s, v11.4s, %[x0].s[1]\n"
+                "ldr    q11, [%[a_ptr], #128]\n"
+                "fmla    %[r2].4s, v12.4s, %[x0].s[1]\n"
+                "ldr    q12, [%[a_ptr], #144]\n"
+                "fmla    %[r3].4s, v13.4s, %[x0].s[1]\n"
+                "ldr    q13, [%[a_ptr], #160]\n" ASM_PREFETCH("[%[a_ptr], #1792]")
+                "fmla    %[r4].4s, v14.4s, %[x0].s[1]\n"
+                "ldr    q14, [%[a_ptr], #176]\n"
+                "fmla    %[r5].4s, v15.4s, %[x0].s[1]\n"
+                "ldr    q15, [%[a_ptr], #192]\n"
+                "fmla    %[r6].4s, v16.4s, %[x0].s[1]\n"
+                "ldr    q16, [%[a_ptr], #208]\n"
+                "fmla    %[r7].4s, v17.4s, %[x0].s[1]\n"
+                "ldr    q17, [%[a_ptr], #224]\n" ASM_PREFETCH("[%[a_ptr], #1856]")
+
+                // Unroll 2
+                "fmla    %[r0].4s, v18.4s, %[x0].s[2]\n"
+                "ldr    q18, [%[a_ptr], #240]\n"
+                "fmla    %[r1].4s, v19.4s, %[x0].s[2]\n"
+                "ldr    q19, [%[a_ptr], #256]\n"
+                "fmla    %[r2].4s, v20.4s, %[x0].s[2]\n"
+                "ldr    q20, [%[a_ptr], #272]\n"
+                "fmla    %[r3].4s, v21.4s, %[x0].s[2]\n"
+                "ldr    q21, [%[a_ptr], #288]\n" ASM_PREFETCH("[%[a_ptr], #1920]")
+                "fmla    %[r4].4s, v22.4s, %[x0].s[2]\n"
+                "ldr    q22, [%[a_ptr], #304]\n"
+                "fmla    %[r5].4s, v23.4s, %[x0].s[2]\n"
+                "ldr    q23, [%[a_ptr], #320]\n"
+                "fmla    %[r6].4s, v3.4s, %[x0].s[2]\n"
+                "ldr    q2, [%[a_ptr], #336]\n"
+                "ldr    q3, [%[a_ptr], #352]\n"
+                "fmla    %[r7].4s, v4.4s, %[x0].s[2]\n"
+                "ldr    q4, [%[a_ptr], #368]\n" ASM_PREFETCH("[%[a_ptr], #1984]")
+
+                // Unroll 3
+                "fmla    %[r0].4s, v5.4s, %[x0].s[3]\n"
+                "ldr    q5, [%[a_ptr], #384]\n"
+                "fmla    %[r1].4s, v6.4s, %[x0].s[3]\n"
+                "ldr    q6, [%[a_ptr], #400]\n"
+                "fmla    %[r2].4s, v7.4s, %[x0].s[3]\n"
+                "ldr    q7, [%[a_ptr], #416]\n"
+                "fmla    %[r3].4s, v8.4s, %[x0].s[3]\n" ASM_PREFETCH("[%[a_ptr], #2048]")
+                "ldr    q8, [%[a_ptr], #432]\n"
+                "fmla    %[r4].4s, v9.4s, %[x0].s[3]\n"
+                "ldr    q9, [%[a_ptr], #448]\n"
+                "fmla    %[r5].4s, v10.4s, %[x0].s[3]\n"
+                "ldr    q10, [%[a_ptr], #464]\n"
+                "fmla    %[r6].4s, v11.4s, %[x0].s[3]\n"
+                "ldr    q11, [%[a_ptr], #480]\n"
+                "fmla    %[r7].4s, v12.4s, %[x0].s[3]\n"
+                "ldr    q12, [%[a_ptr], #496]\n" ASM_PREFETCH("[%[a_ptr], #2112]")
+
+                // Unroll 4
+                "fmla    %[r0].4s, v13.4s, %[x0a].s[0]\n"
+                "ldr    %q[x0], [%[x_ptr]]\n"
+                "fmla    %[r1].4s, v14.4s, %[x0a].s[0]\n"
+                "ldr    q14, [%[a_ptr], #512]\n"
+                "fmla    %[r2].4s, v15.4s, %[x0a].s[0]\n"
+                "ldr    q15, [%[a_ptr], #528]\n"
+                "fmla    %[r3].4s, v16.4s, %[x0a].s[0]\n" ASM_PREFETCH("[%[a_ptr], #2176]")
+                "ldr    q16, [%[a_ptr], #544]\n"
+                "fmla    %[r4].4s, v17.4s, %[x0a].s[0]\n"
+                "ldr    q17, [%[a_ptr], #560]\n"
+                "fmla    %[r5].4s, v18.4s, %[x0a].s[0]\n"
+                "ldr    q18, [%[a_ptr], #576]\n"
+                "fmla    %[r6].4s, v19.4s, %[x0a].s[0]\n"
+                "ldr    q19, [%[a_ptr], #592]\n"
+                "fmla    %[r7].4s, v20.4s, %[x0a].s[0]\n"
+                "ldr    q20, [%[a_ptr], #608]\n" ASM_PREFETCH("[%[a_ptr], #2240]")
+
+                // Unroll 5
+                "fmla    %[r0].4s, v21.4s, %[x0a].s[1]\n"
+                "ldr    q21, [%[a_ptr], #624]\n"
+                "fmla    %[r1].4s, v22.4s, %[x0a].s[1]\n"
+                "ldr    q22, [%[a_ptr], #640]\n"
+                "fmla    %[r2].4s, v23.4s, %[x0a].s[1]\n"
+                "ldr    q23, [%[a_ptr], #656]\n"
+                "fmla    %[r3].4s, v2.4s, %[x0a].s[1]\n"
+                "ldr    q2, [%[a_ptr], #672]\n" ASM_PREFETCH("[%[a_ptr], #2304]")
+                "fmla    %[r4].4s, v3.4s, %[x0a].s[1]\n"
+                "ldr    q3, [%[a_ptr], #688]\n"
+                "fmla    %[r5].4s, v4.4s, %[x0a].s[1]\n"
+                "ldr    q4, [%[a_ptr], #704]\n"
+                "fmla    %[r6].4s, v5.4s, %[x0a].s[1]\n"
+                "ldr    q5, [%[a_ptr], #720]\n"
+                "fmla    %[r7].4s, v6.4s, %[x0a].s[1]\n"
+                "ldr    q6, [%[a_ptr], #736]\n" ASM_PREFETCH("[%[a_ptr], #2368]")
+
+                // Unroll 6
+                "fmla    %[r0].4s, v7.4s, %[x0a].s[2]\n"
+                "ldr    q7, [%[a_ptr], #752]\n"
+                "fmla    %[r1].4s, v8.4s, %[x0a].s[2]\n"
+                "ldr    q8, [%[a_ptr], #768]\n"
+                "fmla    %[r2].4s, v9.4s, %[x0a].s[2]\n"
+                "ldr    q9, [%[a_ptr], #784]\n"
+                "fmla    %[r3].4s, v10.4s, %[x0a].s[2]\n"
+                "ldr    q10, [%[a_ptr], #800]\n" ASM_PREFETCH("[%[a_ptr], #2432]")
+                "fmla    %[r4].4s, v11.4s, %[x0a].s[2]\n"
+                "ldr    q11, [%[a_ptr], #816]\n"
+                "fmla    %[r5].4s, v12.4s, %[x0a].s[2]\n"
+                "ldr    q12, [%[a_ptr], #832]\n"
+                "fmla    %[r6].4s, v14.4s, %[x0a].s[2]\n"
+                "ldr    q13, [%[a_ptr], #848]\n"
+                "ldr    q14, [%[a_ptr], #864]\n"
+                "fmla    %[r7].4s, v15.4s, %[x0a].s[2]\n"
+                "ldr    q15, [%[a_ptr], #880]\n" ASM_PREFETCH("[%[a_ptr], #2496]")
+
+                // Unroll 7
+                "fmla    %[r0].4s, v16.4s, %[x0a].s[3]\n"
+                "ldr    q16, [%[a_ptr], #896]\n"
+                "fmla    %[r1].4s, v17.4s, %[x0a].s[3]\n"
+                "ldr    q17, [%[a_ptr], #912]\n"
+                "fmla    %[r2].4s, v18.4s, %[x0a].s[3]\n"
+                "ldr    q18, [%[a_ptr], #928]\n"
+                "fmla    %[r3].4s, v19.4s, %[x0a].s[3]\n" ASM_PREFETCH("[%[a_ptr], #2560]")
+                "ldr    q19, [%[a_ptr], #944]\n"
+                "fmla    %[r4].4s, v20.4s, %[x0a].s[3]\n"
+                "ldr    q20, [%[a_ptr], #960]\n"
+                "fmla    %[r5].4s, v21.4s, %[x0a].s[3]\n"
+                "ldr    q21, [%[a_ptr], #976]\n"
+                "add    %[a_ptr], %[a_ptr], #1024\n"
+                "fmla    %[r6].4s, v22.4s, %[x0a].s[3]\n"
+                "ldr    q22, [%[a_ptr], #-32]\n"
+                "fmla    %[r7].4s, v23.4s, %[x0a].s[3]\n"
+                "ldr    q23, [%[a_ptr], #-16]\n" ASM_PREFETCH("[%[a_ptr], #1600]")
+                "bne    1b\n"
+
+                // Detached final iteration
+                "2:\n"
+
+                // Unroll 0
+                "fmla    %[r0].4s, v2.4s, %[x0].s[0]\n"
+                "ldr    %q[x0a], [%[x_ptr], #16]\n"
+                "fmla    %[r1].4s, v3.4s, %[x0].s[0]\n"
+                "ldr    q3, [%[a_ptr], #0]\n"
+                "subs    %w[k], %w[k], #1\n"
+                "fmla    %[r2].4s, v4.4s, %[x0].s[0]\n"
+                "ldr    q4, [%[a_ptr], #16]\n"
+                "fmla    %[r3].4s, v5.4s, %[x0].s[0]\n"
+                "ldr    q5, [%[a_ptr], #32]\n"
+                "add    %[x_ptr], %[x_ptr], #32\n"
+                "fmla    %[r4].4s, v6.4s, %[x0].s[0]\n"
+                "ldr    q6, [%[a_ptr], #48]\n"
+                "fmla    %[r5].4s, v7.4s, %[x0].s[0]\n"
+                "ldr    q7, [%[a_ptr], #64]\n"
+                "fmla    %[r6].4s, v8.4s, %[x0].s[0]\n"
+                "ldr    q8, [%[a_ptr], #80]\n"
+                "fmla    %[r7].4s, v9.4s, %[x0].s[0]\n"
+                "ldr    q9, [%[a_ptr], #96]\n"
+
+                // Unroll 1
+                "fmla    %[r0].4s, v10.4s, %[x0].s[1]\n"
+                "ldr    q10, [%[a_ptr], #112]\n"
+                "fmla    %[r1].4s, v11.4s, %[x0].s[1]\n"
+                "ldr    q11, [%[a_ptr], #128]\n"
+                "fmla    %[r2].4s, v12.4s, %[x0].s[1]\n"
+                "ldr    q12, [%[a_ptr], #144]\n"
+                "fmla    %[r3].4s, v13.4s, %[x0].s[1]\n"
+                "ldr    q13, [%[a_ptr], #160]\n"
+                "fmla    %[r4].4s, v14.4s, %[x0].s[1]\n"
+                "ldr    q14, [%[a_ptr], #176]\n"
+                "fmla    %[r5].4s, v15.4s, %[x0].s[1]\n"
+                "ldr    q15, [%[a_ptr], #192]\n"
+                "fmla    %[r6].4s, v16.4s, %[x0].s[1]\n"
+                "ldr    q16, [%[a_ptr], #208]\n"
+                "fmla    %[r7].4s, v17.4s, %[x0].s[1]\n"
+                "ldr    q17, [%[a_ptr], #224]\n"
+
+                // Unroll 2
+                "fmla    %[r0].4s, v18.4s, %[x0].s[2]\n"
+                "ldr    q18, [%[a_ptr], #240]\n"
+                "fmla    %[r1].4s, v19.4s, %[x0].s[2]\n"
+                "ldr    q19, [%[a_ptr], #256]\n"
+                "fmla    %[r2].4s, v20.4s, %[x0].s[2]\n"
+                "ldr    q20, [%[a_ptr], #272]\n"
+                "fmla    %[r3].4s, v21.4s, %[x0].s[2]\n"
+                "ldr    q21, [%[a_ptr], #288]\n"
+                "fmla    %[r4].4s, v22.4s, %[x0].s[2]\n"
+                "ldr    q22, [%[a_ptr], #304]\n"
+                "fmla    %[r5].4s, v23.4s, %[x0].s[2]\n"
+                "ldr    q23, [%[a_ptr], #320]\n"
+                "fmla    %[r6].4s, v3.4s, %[x0].s[2]\n"
+                "ldr    q2, [%[a_ptr], #336]\n"
+                "ldr    q3, [%[a_ptr], #352]\n"
+                "fmla    %[r7].4s, v4.4s, %[x0].s[2]\n"
+                "ldr    q4, [%[a_ptr], #368]\n"
+
+                // Unroll 3
+                "fmla    %[r0].4s, v5.4s, %[x0].s[3]\n"
+                "ldr    q5, [%[a_ptr], #384]\n"
+                "fmla    %[r1].4s, v6.4s, %[x0].s[3]\n"
+                "ldr    q6, [%[a_ptr], #400]\n"
+                "fmla    %[r2].4s, v7.4s, %[x0].s[3]\n"
+                "ldr    q7, [%[a_ptr], #416]\n"
+                "fmla    %[r3].4s, v8.4s, %[x0].s[3]\n"
+                "ldr    q8, [%[a_ptr], #432]\n"
+                "fmla    %[r4].4s, v9.4s, %[x0].s[3]\n"
+                "ldr    q9, [%[a_ptr], #448]\n"
+                "fmla    %[r5].4s, v10.4s, %[x0].s[3]\n"
+                "ldr    q10, [%[a_ptr], #464]\n"
+                "fmla    %[r6].4s, v11.4s, %[x0].s[3]\n"
+                "ldr    q11, [%[a_ptr], #480]\n"
+                "fmla    %[r7].4s, v12.4s, %[x0].s[3]\n"
+                "ldr    q12, [%[a_ptr], #496]\n"
+
+                // Unroll 4
+                "fmla    %[r0].4s, v13.4s, %[x0a].s[0]\n"
+                "fmla    %[r1].4s, v14.4s, %[x0a].s[0]\n"
+                "ldr    q14, [%[a_ptr], #512]\n"
+                "fmla    %[r2].4s, v15.4s, %[x0a].s[0]\n"
+                "ldr    q15, [%[a_ptr], #528]\n"
+                "fmla    %[r3].4s, v16.4s, %[x0a].s[0]\n"
+                "ldr    q16, [%[a_ptr], #544]\n"
+                "fmla    %[r4].4s, v17.4s, %[x0a].s[0]\n"
+                "ldr    q17, [%[a_ptr], #560]\n"
+                "fmla    %[r5].4s, v18.4s, %[x0a].s[0]\n"
+                "ldr    q18, [%[a_ptr], #576]\n"
+                "fmla    %[r6].4s, v19.4s, %[x0a].s[0]\n"
+                "ldr    q19, [%[a_ptr], #592]\n"
+                "fmla    %[r7].4s, v20.4s, %[x0a].s[0]\n"
+                "ldr    q20, [%[a_ptr], #608]\n"
+
+                // Unroll 5
+                "fmla    %[r0].4s, v21.4s, %[x0a].s[1]\n"
+                "ldr    q21, [%[a_ptr], #624]\n"
+                "fmla    %[r1].4s, v22.4s, %[x0a].s[1]\n"
+                "ldr    q22, [%[a_ptr], #640]\n"
+                "fmla    %[r2].4s, v23.4s, %[x0a].s[1]\n"
+                "ldr    q23, [%[a_ptr], #656]\n"
+                "fmla    %[r3].4s, v2.4s, %[x0a].s[1]\n"
+                "add    %[a_ptr], %[a_ptr], #672\n"
+                "fmla    %[r4].4s, v3.4s, %[x0a].s[1]\n"
+                "fmla    %[r5].4s, v4.4s, %[x0a].s[1]\n"
+                "fmla    %[r6].4s, v5.4s, %[x0a].s[1]\n"
+                "fmla    %[r7].4s, v6.4s, %[x0a].s[1]\n"
+
+                // Unroll 6
+                "fmla    %[r0].4s, v7.4s, %[x0a].s[2]\n"
+                "fmla    %[r1].4s, v8.4s, %[x0a].s[2]\n"
+                "fmla    %[r2].4s, v9.4s, %[x0a].s[2]\n"
+                "fmla    %[r3].4s, v10.4s, %[x0a].s[2]\n"
+                "fmla    %[r4].4s, v11.4s, %[x0a].s[2]\n"
+                "fmla    %[r5].4s, v12.4s, %[x0a].s[2]\n"
+                "fmla    %[r6].4s, v14.4s, %[x0a].s[2]\n"
+                "fmla    %[r7].4s, v15.4s, %[x0a].s[2]\n"
+
+                // Unroll 7
+                "fmla    %[r0].4s, v16.4s, %[x0a].s[3]\n"
+                "fmla    %[r1].4s, v17.4s, %[x0a].s[3]\n"
+                "fmla    %[r2].4s, v18.4s, %[x0a].s[3]\n"
+                "fmla    %[r3].4s, v19.4s, %[x0a].s[3]\n"
+                "fmla    %[r4].4s, v20.4s, %[x0a].s[3]\n"
+                "fmla    %[r5].4s, v21.4s, %[x0a].s[3]\n"
+                "fmla    %[r6].4s, v22.4s, %[x0a].s[3]\n"
+                "fmla    %[r7].4s, v23.4s, %[x0a].s[3]\n"
+                :
+                [a_ptr] "+r"(a_ptr), [x_ptr] "+r"(x_ptr),
+                [x0] "+w"(x0), [x0a] "+w"(x0a), [k] "+r"(k),
+                [r0] "+w"(r0), [r1] "+w"(r1), [r2] "+w"(r2), [r3] "+w"(r3),
+                [r4] "+w"(r4), [r5] "+w"(r5), [r6] "+w"(r6), [r7] "+w"(r7)
+                :
+                : "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
+                "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "x20", "x21", "cc", "memory");
+        }
+
+        // Deal with ragged M
+        if(M % 8)
+        {
+            int l = (M % 8) - 1;
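+
+            // Each row's data is loaded one step ahead of its multiply: the
+            // first remaining row (and its x value) is loaded before the
+            // loop and the final multiply happens after it (label 2), so the
+            // loop body only needs to run (M % 8) - 1 times.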
+
+            __asm __volatile(
+                "ldr    q2, [%[a_ptr], #0]\n"
+                "ldr    q3, [%[a_ptr], #16]\n"
+                "ldr    q4, [%[a_ptr], #32]\n"
+                "ldr    q5, [%[a_ptr], #48]\n"
+                "ldr    q6, [%[a_ptr], #64]\n"
+                "ldr    q7, [%[a_ptr], #80]\n"
+                "ldr    q8, [%[a_ptr], #96]\n"
+                "ldr    q9, [%[a_ptr], #112]\n"
+                "ldr    %s[x0], [%[x_ptr]]\n"
+                "add    %[a_ptr], %[a_ptr], #128\n"
+                "add    %[x_ptr], %[x_ptr], #4\n"
+
+                "cbz    %w[l], 2f\n"
+
+                "1:\n"
+                "fmla    %[r0].4s, v2.4s, %[x0].s[0]\n"
+                "ldr    q2, [%[a_ptr], #0]\n"
+                "subs    %w[l], %w[l], #1\n"
+                "fmla    %[r1].4s, v3.4s, %[x0].s[0]\n"
+                "ldr    q3, [%[a_ptr], #16]\n"
+                "fmla    %[r2].4s, v4.4s, %[x0].s[0]\n"
+                "ldr    q4, [%[a_ptr], #32]\n"
+                "fmla    %[r3].4s, v5.4s, %[x0].s[0]\n"
+                "ldr    q5, [%[a_ptr], #48]\n"
+                "fmla    %[r4].4s, v6.4s, %[x0].s[0]\n"
+                "ldr    q6, [%[a_ptr], #64]\n"
+                "fmla    %[r5].4s, v7.4s, %[x0].s[0]\n"
+                "ldr    q7, [%[a_ptr], #80]\n"
+                "fmla    %[r6].4s, v8.4s, %[x0].s[0]\n"
+                "ldr    q8, [%[a_ptr], #96]\n"
+                "fmla    %[r7].4s, v9.4s, %[x0].s[0]\n"
+                "ldr    q9, [%[a_ptr], #112]\n"
+                "ldr    %s[x0], [%[x_ptr]]\n"
+                "add    %[a_ptr], %[a_ptr], #128\n"
+                "add    %[x_ptr], %[x_ptr], #4\n"
+                "bne    1b\n"
+
+                "2:\n"
+
+                "fmla    %[r0].4s, v2.4s, %[x0].s[0]\n"
+                "fmla    %[r1].4s, v3.4s, %[x0].s[0]\n"
+                "fmla    %[r2].4s, v4.4s, %[x0].s[0]\n"
+                "fmla    %[r3].4s, v5.4s, %[x0].s[0]\n"
+                "fmla    %[r4].4s, v6.4s, %[x0].s[0]\n"
+                "fmla    %[r5].4s, v7.4s, %[x0].s[0]\n"
+                "fmla    %[r6].4s, v8.4s, %[x0].s[0]\n"
+                "fmla    %[r7].4s, v9.4s, %[x0].s[0]\n"
+                :
+                [a_ptr] "+r"(a_ptr), [x_ptr] "+r"(x_ptr),
+                [x0] "+w"(x0), [l] "+r"(l),
+                [r0] "+w"(r0), [r1] "+w"(r1), [r2] "+w"(r2), [r3] "+w"(r3),
+                [r4] "+w"(r4), [r5] "+w"(r5), [r6] "+w"(r6), [r7] "+w"(r7)
+                :
+                : "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "cc", "memory");
+        }
+
+        if(l == 32)
+        {
+            // Fast path
+            vst1q_f32(y_ptr, r0);
+            vst1q_f32(y_ptr + 4, r1);
+            vst1q_f32(y_ptr + 8, r2);
+            vst1q_f32(y_ptr + 12, r3);
+            vst1q_f32(y_ptr + 16, r4);
+            vst1q_f32(y_ptr + 20, r5);
+            vst1q_f32(y_ptr + 24, r6);
+            vst1q_f32(y_ptr + 28, r7);
+        }
+        else
+        {
+            int vecs    = l / 4;
+            int oddbits = l % 4;
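+
+            // For example, l == 13 gives vecs == 3 (r0-r2 stored as whole
+            // vectors below) and oddbits == 1 (lane 0 of r3 stored on its own).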
+
+            if(oddbits)
+            {
+                // As above - slowest path deals with vectors plus odd bits
+                float32x4_t oddvec;
+
+                do
+                {
+                    if(vecs == 0)
+                    {
+                        oddvec = r0;
+                        break;
+                    }
+
+                    vst1q_f32(y_ptr, r0);
+                    if(--vecs == 0)
+                    {
+                        oddvec = r1;
+                        break;
+                    }
+
+                    vst1q_f32(y_ptr + 4, r1);
+                    if(--vecs == 0)
+                    {
+                        oddvec = r2;
+                        break;
+                    }
+
+                    vst1q_f32(y_ptr + 8, r2);
+                    if(--vecs == 0)
+                    {
+                        oddvec = r3;
+                        break;
+                    }
+
+                    vst1q_f32(y_ptr + 12, r3);
+                    if(--vecs == 0)
+                    {
+                        oddvec = r4;
+                        break;
+                    }
+
+                    vst1q_f32(y_ptr + 16, r4);
+                    if(--vecs == 0)
+                    {
+                        oddvec = r5;
+                        break;
+                    }
+
+                    vst1q_f32(y_ptr + 20, r5);
+                    if(--vecs == 0)
+                    {
+                        oddvec = r6;
+                        break;
+                    }
+
+                    vst1q_f32(y_ptr + 24, r6);
+                    oddvec = r7;
+                }
+                while(0);
+
+                float *oddbase = y_ptr + l - oddbits;
+
+                switch(oddbits)
+                {
+                    case 3:
+                        vst1q_lane_f32(oddbase + 2, oddvec, 2);
+                    // fall through
+                    case 2:
+                        vst1q_lane_f32(oddbase + 1, oddvec, 1);
+                    // fall through
+                    case 1:
+                        vst1q_lane_f32(oddbase, oddvec, 0);
+                        break;
+
+                    default:
+                        // oddbits must be 1, 2 or 3.
+                        UNREACHABLE("Impossible case in switch.");
+                }
+            }
+            else
+            {
+                // As above - medium path deals with vectors only
+                do
+                {
+                    if(vecs == 0)
+                    {
+                        UNREACHABLE("vecs and oddbits can't both be 0");
+                    }
+
+                    vst1q_f32(y_ptr, r0);
+                    if(--vecs == 0)
+                    {
+                        break;
+                    }
+
+                    vst1q_f32(y_ptr + 4, r1);
+                    if(--vecs == 0)
+                    {
+                        break;
+                    }
+
+                    vst1q_f32(y_ptr + 8, r2);
+                    if(--vecs == 0)
+                    {
+                        break;
+                    }
+
+                    vst1q_f32(y_ptr + 12, r3);
+                    if(--vecs == 0)
+                    {
+                        break;
+                    }
+
+                    vst1q_f32(y_ptr + 16, r4);
+                    if(--vecs == 0)
+                    {
+                        break;
+                    }
+
+                    vst1q_f32(y_ptr + 20, r5);
+                    if(--vecs == 0)
+                    {
+                        break;
+                    }
+
+                    vst1q_f32(y_ptr + 24, r6);
+                }
+                while(0);
+            }
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // aarch64
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
new file mode 100644
index 0000000..5b9bd72
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_sgemv_trans(const float *, const float *, float *, float, int, int, int);
+
+// Transposed SGEMV strategy class.
+class sgemv_trans
+{
+public:
+    typedef float operand_type;
+    typedef float result_type;
+
+    typedef void (*kern_type)(const float *, const float *, float *, float, int, int, int);
+
+    /* Kernel blocking parameters */
+    static const int out_width = 96;
+    static const int k_unroll  = 1;
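+
+    /* out_width = 96 corresponds to the 24 128-bit accumulator registers
+     * (24 x 4 floats) that the kernel keeps live in its main loop. */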
+
+    kern_type kernel = a64_sgemv_trans;
+
+    sgemv_trans(const CPUInfo *ci)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp
new file mode 100644
index 0000000..3309baf
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp
@@ -0,0 +1,913 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+// Kernel implementation - transposed GEMV
+//
+// The kernel will process "M" rows of A (= steps of dot product) and "N"
+// columns (= dot products total)
+//
+// General plan is to do as many columns simultaneously as possible - the
+// main loop below keeps 24 of the 32 128-bit NEON registers live as
+// accumulators, i.e. 96 columns per pass.
+//
+// It's possible that messing around with sub-blocking M and N can yield
+// higher performance, but that's left to the outer loop.  In this kernel we
+// process all of M at the same time.
+
+// How far ahead to prefetch for the first and subsequent prefetches.
+// These values work for A72 on JunoR2...
+
+#define FIRST_PFD 9
+#define PFD 6
+
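+// As a rough reference for what the assembly below computes (illustrative
+// only, not used by the build): for each output column n, the kernel
+// accumulates a dot product down the rows of A and then adds alpha times
+// that sum into the existing y value, i.e. roughly
+//
+//     for (int n = 0; n < N; n++) {
+//         float acc = 0.0f;
+//         for (int m = 0; m < M; m++) {
+//             acc += Astart[m * lda + n] * Xstart[m];
+//         }
+//         Ystart[n] += alpha * acc;
+//     }
+//
+// The real kernel works on up to 96 columns of this loop nest at a time and
+// adds software prefetching, but the arithmetic is the same.
+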
+namespace arm_gemm
+{
+void a64_sgemv_trans(const float *Astart, const float *Xstart, float *Ystart, float alpha, int lda, int M, int N)
+{
+    const float *a_ptr_base = Astart;
+    float       *y_ptr      = Ystart;
+
+    register const float32x4_t va asm("v1") = vdupq_n_f32(alpha);
+
+    int firstpfd = FIRST_PFD;
+    if(firstpfd > M)
+    {
+        firstpfd = (M - 1);
+    }
+
+    int pfd = PFD;
+    if(pfd > M)
+    {
+        pfd = (M - 1);
+    }
+
+    ptrdiff_t jump = lda * sizeof(float);
+
+    for(; N >= 96; N -= 96)
+    {
+        int k = M - 1;
+
+        const float *a_ptr       = a_ptr_base;
+        const float *x_ptr       = Xstart;
+        const float *pf_ptr      = a_ptr;
+        const float *firstpf_ptr = a_ptr;
+        const float *pf_limit    = a_ptr + (M * lda);
+
+        for(int i = 0; i < firstpfd; i++)
+        {
+            prefetch_1x(firstpf_ptr);
+            firstpf_ptr += lda;
+        }
+
+        for(int i = 0; i < pfd; i++)
+        {
+            prefetch_5x(pf_ptr + 16);
+            pf_ptr += lda;
+        }
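+
+        // Between them, firstpf_ptr pulls in the first cache line of each
+        // row FIRST_PFD rows ahead, while pf_ptr (offset by 16 floats = 64
+        // bytes) covers the remaining five cache lines of each 96-float
+        // (384-byte) row segment PFD rows ahead.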
+
+        a_ptr_base += 96;
+
+        __asm __volatile(
+            "movi    v8.4s,#0x0\n"
+            "ldr    w0, [%[x_ptr]]\n"
+            "movi    v9.4s,#0x0\n"
+            "ldr    q2,  [%[a_ptr], #0]\n"
+            "movi    v10.4s,#0x0\n"
+            "ldr    q3,  [%[a_ptr], #0x10]\n"
+            "movi    v11.4s,#0x0\n"
+            "ldr    q4, [%[a_ptr], #0x20]\n"
+            "movi    v12.4s,#0x0\n"
+            "ldr    q5, [%[a_ptr], #0x30]\n"
+            "movi    v13.4s,#0x0\n"
+            "ldr    q6, [%[a_ptr], #0x40]\n"
+            "movi    v14.4s,#0x0\n"
+            "ldr    q7, [%[a_ptr], #0x50]\n"
+            "movi    v15.4s,#0x0\n" ASM_PREFETCH("[%[firstpf_ptr]]")
+            "movi    v16.4s, #0x0\n"
+            "movi    v17.4s, #0x0\n" ASM_PREFETCH("[%[pf_ptr], #64]")
+            "movi    v18.4s, #0x0\n"
+            "movi    v19.4s, #0x0\n" ASM_PREFETCH("[%[pf_ptr], #128]")
+            "movi    v20.4s, #0x0\n"
+            "movi    v21.4s, #0x0\n" ASM_PREFETCH("[%[pf_ptr], #192]")
+            "movi    v22.4s, #0x0\n"
+            "movi    v23.4s, #0x0\n" ASM_PREFETCH("[%[pf_ptr], #256]")
+            "movi    v24.4s, #0x0\n"
+            "movi    v25.4s, #0x0\n" ASM_PREFETCH("[%[pf_ptr], #320]")
+            "movi    v26.4s, #0x0\n"
+            "movi    v27.4s, #0x0\n"
+            "add    %[pf_ptr], %[pf_ptr], %[jump]\n"
+            "movi    v28.4s, #0x0\n"
+            "add    %[firstpf_ptr], %[firstpf_ptr], %[jump]\n"
+            "movi    v29.4s, #0x0\n"
+            "movi    v30.4s, #0x0\n"
+            "movi    v31.4s, #0x0\n"
+
+            // Skip everything if there are no iterations of the main loop to do.
+            "cbz    %w[k], 10f\n"
+
+            // Loop with all prefetches.  Exit this loop when firstpf_ptr
+            // hits pf_limit.
+            "1:\n"
+            "dup    v0.4s, w0\n"
+            "ldr    w0, [%[x_ptr], #4]\n"
+            "add    %[x_ptr], %[x_ptr], #0x4\n"
+            "fmla    v8.4s, v2.4s, v0.4s\n"
+            "ldr    q2, [%[a_ptr], #0x60]\n"
+            "fmla    v9.4s, v3.4s, v0.4s\n"
+            "ldr    q3, [%[a_ptr], #0x70]\n" ASM_PREFETCH("[%[firstpf_ptr]]")
+            "fmla    v10.4s, v4.4s, v0.4s\n"
+            "ldr    q4, [%[a_ptr], #0x80]\n"
+            "add    %[firstpf_ptr], %[firstpf_ptr], %[jump]\n"
+            "fmla    v11.4s, v5.4s, v0.4s\n"
+            "ldr    q5, [%[a_ptr], #0x90]\n"
+            "sub    %w[k], %w[k], #1\n" ASM_PREFETCH("[%[x_ptr], #128]")
+            "fmla    v12.4s, v6.4s, v0.4s\n"
+            "ldr    q6, [%[a_ptr], #0xa0]\n"
+            "fmla    v13.4s, v7.4s, v0.4s\n"
+            "ldr    q7, [%[a_ptr], #0xb0]\n" ASM_PREFETCH("[%[pf_ptr], #0x40]")
+            "fmla    v14.4s, v2.4s, v0.4s\n"
+            "ldr    q2, [%[a_ptr], #0xc0]\n"
+            "fmla    v15.4s, v3.4s, v0.4s\n"
+            "ldr    q3, [%[a_ptr], #0xd0]\n"
+            "fmla    v16.4s, v4.4s, v0.4s\n"
+            "ldr    q4, [%[a_ptr], #0xe0]\n"
+            "fmla    v17.4s, v5.4s, v0.4s\n"
+            "ldr    q5, [%[a_ptr], #0xf0]\n" ASM_PREFETCH("[%[pf_ptr], #0x80]")
+            "fmla    v18.4s, v6.4s, v0.4s\n"
+            "ldr    q6, [%[a_ptr], #0x100]\n"
+            "fmla    v19.4s, v7.4s, v0.4s\n"
+            "ldr    q7, [%[a_ptr], #0x110]\n"
+            "fmla    v20.4s, v2.4s, v0.4s\n"
+            "ldr    q2, [%[a_ptr], #0x120]\n"
+            "fmla    v21.4s, v3.4s, v0.4s\n"
+            "ldr    q3, [%[a_ptr], #0x130]\n" ASM_PREFETCH("[%[pf_ptr], #0xc0]")
+            "fmla    v22.4s, v4.4s, v0.4s\n"
+            "ldr    q4, [%[a_ptr], #0x140]\n"
+            "fmla    v23.4s, v5.4s, v0.4s\n"
+            "ldr    q5, [%[a_ptr], #0x150]\n"
+            "fmla    v24.4s, v6.4s, v0.4s\n"
+            "ldr    q6, [%[a_ptr], #0x160]\n"
+            "fmla    v25.4s, v7.4s, v0.4s\n"
+            "ldr    q7, [%[a_ptr], #0x170]\n" ASM_PREFETCH("[%[pf_ptr], #0x100]")
+            "add    %[a_ptr], %[a_ptr], %[jump]\n"
+            "fmla    v26.4s, v2.4s, v0.4s\n"
+            "ldr    q2, [%[a_ptr], #0x00]\n"
+            "fmla    v27.4s, v3.4s, v0.4s\n"
+            "ldr    q3, [%[a_ptr], #0x10]\n"
+            "fmla    v28.4s, v4.4s, v0.4s\n"
+            "ldr    q4, [%[a_ptr], #0x20]\n"
+            "fmla    v29.4s, v5.4s, v0.4s\n"
+            "ldr    q5, [%[a_ptr], #0x30]\n" ASM_PREFETCH("[%[pf_ptr], #0x140]")
+            "fmla    v30.4s, v6.4s, v0.4s\n"
+            "add    %[pf_ptr], %[pf_ptr], %[jump]\n"
+            "ldr    q6, [%[a_ptr], #0x40]\n"
+            "fmla    v31.4s, v7.4s, v0.4s\n"
+            "cmp    %[firstpf_ptr], %[pf_limit]\n"
+            "ldr    q7, [%[a_ptr], #0x50]\n"
+            "blt    1b\n"
+
+            // Check that there are still "main" prefetches to do.
+            "cmp    %[pf_ptr], %[pf_limit]\n"
+            "bge    9f\n"
+
+            // Just the main prefetches, exit this loop when pf_ptr hits pf_limit.
+            "8:\n"
+            "dup    v0.4s, w0\n"
+            "ldr    w0, [%[x_ptr], #4]\n"
+            "add    %[x_ptr], %[x_ptr], #0x4\n"
+            "fmla    v8.4s, v2.4s, v0.4s\n"
+            "ldr    q2, [%[a_ptr], #0x60]\n"
+            "fmla    v9.4s, v3.4s, v0.4s\n"
+            "ldr    q3, [%[a_ptr], #0x70]\n"
+            "fmla    v10.4s, v4.4s, v0.4s\n"
+            "ldr    q4, [%[a_ptr], #0x80]\n"
+            "fmla    v11.4s, v5.4s, v0.4s\n"
+            "ldr    q5, [%[a_ptr], #0x90]\n"
+            "sub    %w[k], %w[k], #1\n" ASM_PREFETCH("[%[x_ptr], #128]")
+            "fmla    v12.4s, v6.4s, v0.4s\n"
+            "ldr    q6, [%[a_ptr], #0xa0]\n"
+            "fmla    v13.4s, v7.4s, v0.4s\n"
+            "ldr    q7, [%[a_ptr], #0xb0]\n" ASM_PREFETCH("[%[pf_ptr], #0x40]")
+            "fmla    v14.4s, v2.4s, v0.4s\n"
+            "ldr    q2, [%[a_ptr], #0xc0]\n"
+            "fmla    v15.4s, v3.4s, v0.4s\n"
+            "ldr    q3, [%[a_ptr], #0xd0]\n"
+            "fmla    v16.4s, v4.4s, v0.4s\n"
+            "ldr    q4, [%[a_ptr], #0xe0]\n"
+            "fmla    v17.4s, v5.4s, v0.4s\n"
+            "ldr    q5, [%[a_ptr], #0xf0]\n" ASM_PREFETCH("[%[pf_ptr], #0x80]")
+            "fmla    v18.4s, v6.4s, v0.4s\n"
+            "ldr    q6, [%[a_ptr], #0x100]\n"
+            "fmla    v19.4s, v7.4s, v0.4s\n"
+            "ldr    q7, [%[a_ptr], #0x110]\n"
+            "fmla    v20.4s, v2.4s, v0.4s\n"
+            "ldr    q2, [%[a_ptr], #0x120]\n"
+            "fmla    v21.4s, v3.4s, v0.4s\n"
+            "ldr    q3, [%[a_ptr], #0x130]\n" ASM_PREFETCH("[%[pf_ptr], #0xc0]")
+            "fmla    v22.4s, v4.4s, v0.4s\n"
+            "ldr    q4, [%[a_ptr], #0x140]\n"
+            "fmla    v23.4s, v5.4s, v0.4s\n"
+            "ldr    q5, [%[a_ptr], #0x150]\n"
+            "fmla    v24.4s, v6.4s, v0.4s\n"
+            "ldr    q6, [%[a_ptr], #0x160]\n"
+            "fmla    v25.4s, v7.4s, v0.4s\n"
+            "ldr    q7, [%[a_ptr], #0x170]\n" ASM_PREFETCH("[%[pf_ptr], #0x100]")
+            "add    %[a_ptr], %[a_ptr], %[jump]\n"
+            "fmla    v26.4s, v2.4s, v0.4s\n"
+            "ldr    q2, [%[a_ptr], #0x00]\n"
+            "fmla    v27.4s, v3.4s, v0.4s\n"
+            "ldr    q3, [%[a_ptr], #0x10]\n"
+            "fmla    v28.4s, v4.4s, v0.4s\n"
+            "ldr    q4, [%[a_ptr], #0x20]\n"
+            "fmla    v29.4s, v5.4s, v0.4s\n"
+            "ldr    q5, [%[a_ptr], #0x30]\n" ASM_PREFETCH("[%[pf_ptr], #0x140]")
+            "fmla    v30.4s, v6.4s, v0.4s\n"
+            "add    %[pf_ptr], %[pf_ptr], %[jump]\n"
+            "ldr    q6, [%[a_ptr], #0x40]\n"
+            "fmla    v31.4s, v7.4s, v0.4s\n"
+            "cmp    %[pf_ptr], %[pf_limit]\n"
+            "ldr    q7, [%[a_ptr], #0x50]\n"
+            "blt    8b\n"
+
+            // Check that there is still work to do.
+            "9:\n"
+            "cmp    %w[k], #0\n"
+            "beq    10f\n"
+
+            // Loop without prefetches, exit when k hits 0.
+            "2:\n"
+            "dup    v0.4s, w0\n"
+            "ldr    w0, [%[x_ptr], #4]\n"
+            "add    %[x_ptr], %[x_ptr], #0x4\n"
+            "fmla    v8.4s, v2.4s, v0.4s\n"
+            "ldr    q2, [%[a_ptr], #0x60]\n"
+            "fmla    v9.4s, v3.4s, v0.4s\n"
+            "ldr    q3, [%[a_ptr], #0x70]\n"
+            "fmla    v10.4s, v4.4s, v0.4s\n"
+            "ldr    q4, [%[a_ptr], #0x80]\n"
+            "fmla    v11.4s, v5.4s, v0.4s\n"
+            "ldr    q5, [%[a_ptr], #0x90]\n"
+            "subs    %w[k], %w[k], #1\n"
+            "fmla    v12.4s, v6.4s, v0.4s\n"
+            "ldr    q6, [%[a_ptr], #0xa0]\n"
+            "fmla    v13.4s, v7.4s, v0.4s\n"
+            "ldr    q7, [%[a_ptr], #0xb0]\n"
+            "fmla    v14.4s, v2.4s, v0.4s\n"
+            "ldr    q2, [%[a_ptr], #0xc0]\n"
+            "fmla    v15.4s, v3.4s, v0.4s\n"
+            "ldr    q3, [%[a_ptr], #0xd0]\n"
+            "fmla    v16.4s, v4.4s, v0.4s\n"
+            "ldr    q4, [%[a_ptr], #0xe0]\n"
+            "fmla    v17.4s, v5.4s, v0.4s\n"
+            "ldr    q5, [%[a_ptr], #0xf0]\n"
+            "fmla    v18.4s, v6.4s, v0.4s\n"
+            "ldr    q6, [%[a_ptr], #0x100]\n"
+            "fmla    v19.4s, v7.4s, v0.4s\n"
+            "ldr    q7, [%[a_ptr], #0x110]\n"
+            "fmla    v20.4s, v2.4s, v0.4s\n"
+            "ldr    q2, [%[a_ptr], #0x120]\n"
+            "fmla    v21.4s, v3.4s, v0.4s\n"
+            "ldr    q3, [%[a_ptr], #0x130]\n"
+            "fmla    v22.4s, v4.4s, v0.4s\n"
+            "ldr    q4, [%[a_ptr], #0x140]\n"
+            "fmla    v23.4s, v5.4s, v0.4s\n"
+            "ldr    q5, [%[a_ptr], #0x150]\n"
+            "fmla    v24.4s, v6.4s, v0.4s\n"
+            "ldr    q6, [%[a_ptr], #0x160]\n"
+            "fmla    v25.4s, v7.4s, v0.4s\n"
+            "ldr    q7, [%[a_ptr], #0x170]\n"
+            "add    %[a_ptr], %[a_ptr], %[jump]\n"
+            "fmla    v26.4s, v2.4s, v0.4s\n"
+            "ldr    q2, [%[a_ptr], #0x00]\n"
+            "fmla    v27.4s, v3.4s, v0.4s\n"
+            "ldr    q3, [%[a_ptr], #0x10]\n"
+            "fmla    v28.4s, v4.4s, v0.4s\n"
+            "ldr    q4, [%[a_ptr], #0x20]\n"
+            "fmla    v29.4s, v5.4s, v0.4s\n"
+            "ldr    q5, [%[a_ptr], #0x30]\n"
+            "fmla    v30.4s, v6.4s, v0.4s\n"
+            "ldr    q6, [%[a_ptr], #0x40]\n"
+            "fmla    v31.4s, v7.4s, v0.4s\n"
+            "ldr    q7, [%[a_ptr], #0x50]\n"
+            "bne    2b\n"
+
+            "10:\n"
+
+            // Final iteration
+            "dup    v0.4s, w0\n"
+            "fmla    v8.4s, v2.4s, v0.4s\n"
+            "ldr    q2, [%[a_ptr], #0x60]\n"
+            "fmla    v9.4s, v3.4s, v0.4s\n"
+            "ldr    q3, [%[a_ptr], #0x70]\n"
+            "fmla    v10.4s, v4.4s, v0.4s\n"
+            "ldr    q4, [%[a_ptr], #0x80]\n"
+            "fmla    v11.4s, v5.4s, v0.4s\n"
+            "ldr    q5, [%[a_ptr], #0x90]\n"
+            "fmla    v12.4s, v6.4s, v0.4s\n"
+            "ldr    q6, [%[a_ptr], #0xa0]\n"
+            "fmla    v13.4s, v7.4s, v0.4s\n"
+            "ldr    q7, [%[a_ptr], #0xb0]\n"
+            "fmla    v14.4s, v2.4s, v0.4s\n"
+            "ldr    q2, [%[a_ptr], #0xc0]\n"
+            "fmla    v15.4s, v3.4s, v0.4s\n"
+            "ldr    q3, [%[a_ptr], #0xd0]\n"
+            "fmla    v16.4s, v4.4s, v0.4s\n"
+            "ldr    q4, [%[a_ptr], #0xe0]\n"
+            "fmla    v17.4s, v5.4s, v0.4s\n"
+            "ldr    q5, [%[a_ptr], #0xf0]\n"
+            "fmla    v18.4s, v6.4s, v0.4s\n"
+            "ldr    q6, [%[a_ptr], #0x100]\n"
+            "fmla    v19.4s, v7.4s, v0.4s\n"
+            "ldr    q7, [%[a_ptr], #0x110]\n"
+            "fmla    v20.4s, v2.4s, v0.4s\n"
+            "ldr    q2, [%[a_ptr], #0x120]\n"
+            "fmla    v21.4s, v3.4s, v0.4s\n"
+            "ldr    q3, [%[a_ptr], #0x130]\n"
+            "fmla    v22.4s, v4.4s, v0.4s\n"
+            "ldr    q4, [%[a_ptr], #0x140]\n"
+            "fmla    v23.4s, v5.4s, v0.4s\n"
+            "ldr    q5, [%[a_ptr], #0x150]\n"
+            "fmla    v24.4s, v6.4s, v0.4s\n"
+            "ldr    q6, [%[a_ptr], #0x160]\n"
+            "fmla    v25.4s, v7.4s, v0.4s\n"
+            "ldr    q7, [%[a_ptr], #0x170]\n"
+            "fmla    v26.4s, v2.4s, v0.4s\n"
+            "ldr    q2,  [%[y_ptr]]\n"
+            "fmla    v27.4s, v3.4s, v0.4s\n"
+            "ldr    q3,  [%[y_ptr], #0x10]\n"
+            "fmla    v28.4s, v4.4s, v0.4s\n"
+            "ldr    q4,  [%[y_ptr], #0x20]\n"
+            "fmla    v29.4s, v5.4s, v0.4s\n"
+            "ldr    q5,  [%[y_ptr], #0x30]\n"
+            "fmla    v30.4s, v6.4s, v0.4s\n"
+            "ldr    q6,  [%[y_ptr], #0x40]\n"
+            "fmla    v31.4s, v7.4s, v0.4s\n"
+            "ldr    q7,  [%[y_ptr], #0x50]\n"
+
+            "fmla    v2.4s, v8.4s, %[va].4s\n"
+            "ldr    q8, [%[y_ptr], #0x60]\n"
+            "fmla    v3.4s, v9.4s, %[va].4s\n"
+            "ldr    q9, [%[y_ptr], #0x70]\n"
+            "fmla    v4.4s, v10.4s, %[va].4s\n"
+            "ldr    q10, [%[y_ptr], #0x80]\n"
+            "fmla    v5.4s, v11.4s, %[va].4s\n"
+            "ldr    q11, [%[y_ptr], #0x90]\n"
+            "fmla    v6.4s, v12.4s, %[va].4s\n"
+            "ldr    q12, [%[y_ptr], #0xa0]\n"
+            "str    q2, [%[y_ptr], #0x00]\n"
+            "fmla    v7.4s, v13.4s, %[va].4s\n"
+            "ldr    q13, [%[y_ptr], #0xb0]\n"
+            "str    q3, [%[y_ptr], #0x10]\n"
+            "fmla    v8.4s, v14.4s, %[va].4s\n"
+            "ldr    q14, [%[y_ptr], #0xc0]\n"
+            "str    q4, [%[y_ptr], #0x20]\n"
+            "fmla    v9.4s, v15.4s, %[va].4s\n"
+            "ldr    q15, [%[y_ptr], #0xd0]\n"
+            "str    q5, [%[y_ptr], #0x30]\n"
+            "fmla    v10.4s, v16.4s, %[va].4s\n"
+            "ldr    q16, [%[y_ptr], #0xe0]\n"
+            "str    q6, [%[y_ptr], #0x40]\n"
+            "fmla    v11.4s, v17.4s, %[va].4s\n"
+            "ldr    q17, [%[y_ptr], #0xf0]\n"
+            "str    q7, [%[y_ptr], #0x50]\n"
+            "fmla    v12.4s, v18.4s, %[va].4s\n"
+            "ldr    q18, [%[y_ptr], #0x100]\n"
+            "str    q8, [%[y_ptr], #0x60]\n"
+            "fmla    v13.4s, v19.4s, %[va].4s\n"
+            "ldr    q19, [%[y_ptr], #0x110]\n"
+            "str    q9, [%[y_ptr], #0x70]\n"
+            "fmla    v14.4s, v20.4s, %[va].4s\n"
+            "ldr    q20, [%[y_ptr], #0x120]\n"
+            "str    q10, [%[y_ptr], #0x80]\n"
+            "fmla    v15.4s, v21.4s, %[va].4s\n"
+            "ldr    q21, [%[y_ptr], #0x130]\n"
+            "str    q11, [%[y_ptr], #0x90]\n"
+            "fmla    v16.4s, v22.4s, %[va].4s\n"
+            "ldr    q22, [%[y_ptr], #0x140]\n"
+            "str    q12, [%[y_ptr], #0xa0]\n"
+            "fmla    v17.4s, v23.4s, %[va].4s\n"
+            "ldr    q23, [%[y_ptr], #0x150]\n"
+            "str    q13, [%[y_ptr], #0xb0]\n"
+            "fmla    v18.4s, v24.4s, %[va].4s\n"
+            "ldr    q24, [%[y_ptr], #0x160]\n"
+            "str    q14, [%[y_ptr], #0xc0]\n"
+            "fmla    v19.4s, v25.4s, %[va].4s\n"
+            "ldr    q25, [%[y_ptr], #0x170]\n"
+            "str    q15, [%[y_ptr], #0xd0]\n"
+            "fmla    v20.4s, v26.4s, %[va].4s\n"
+            "str    q16, [%[y_ptr], #0xe0]\n"
+            "fmla    v21.4s, v27.4s, %[va].4s\n"
+            "str    q17, [%[y_ptr], #0xf0]\n"
+            "fmla    v22.4s, v28.4s, %[va].4s\n"
+            "str    q18, [%[y_ptr], #0x100]\n"
+            "fmla    v23.4s, v29.4s, %[va].4s\n"
+            "str    q19, [%[y_ptr], #0x110]\n"
+            "fmla    v24.4s, v30.4s, %[va].4s\n"
+            "str    q20, [%[y_ptr], #0x120]\n"
+            "fmla    v25.4s, v31.4s, %[va].4s\n"
+            "str    q21, [%[y_ptr], #0x130]\n"
+
+            "stp    q22, q23, [%[y_ptr], #0x140]\n"
+            "stp    q24, q25, [%[y_ptr], #0x160]\n"
+            "add    %[y_ptr], %[y_ptr], #0x180\n"
+
+            : [a_ptr] "+r"(a_ptr), [x_ptr] "+r"(x_ptr), [y_ptr] "+r"(y_ptr), [k] "+r"(k), [pf_ptr] "+r"(pf_ptr), [firstpf_ptr] "+r"(firstpf_ptr)
+            : [jump] "r"(jump), [va] "w"(va), [pf_limit] "r"(pf_limit)
+            : "w0", "v0", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
+            "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
+            "v27", "v28", "v29", "v30", "v31", "cc");
+    }
+
+    if(N > 0)
+    {
+        // Handle N tail - up to 95 stragglers.
+        // This is 0-23 vectors, plus optionally a 64-bit vector and/or a
+        // single value for the remainder.
+
+        // Independent pointers into the matrix for the odd 2-float and odd
+        // 1-float remainders.  They double up as flags to indicate whether
+        // each is needed.
+        const float *odd2_aptr = NULL;
+        const float *odd1_aptr = NULL;
+
+        // Figure out how much work we need to do.
+        int numvecs = N / 4;
+        int rem     = N % 4;
+        int k       = M;
+
+        // Set up pointers for the odd 2/1 if needed.
+        if(rem >= 2)
+        {
+            odd2_aptr = a_ptr_base + (numvecs * 4);
+        }
+
+        if(rem & 1)
+        {
+            odd1_aptr = a_ptr_base + (numvecs * 4) + (odd2_aptr == NULL ? 0 : 2);
+        }
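+
+        // For example, N == 43 gives numvecs == 10 and rem == 3: ten full
+        // q-vectors, a 2-float pair starting 40 floats into the tail
+        // (odd2_aptr) and a single float at offset 42 (odd1_aptr).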
+
+        const float *a_ptr       = a_ptr_base;
+        const float *firstpf_ptr = a_ptr_base;
+        const float *pf_ptr      = a_ptr_base;
+        const float *pf_limit    = a_ptr + (M * lda);
+
+        const float *x_ptr = Xstart;
+        int          vecs  = 0; // Working variable to count how many vectors to work on.
+        int          dopf  = 1; // Track whether we are doing prefetches.
+
+        // Figure out how many cache lines we need to prefetch each time.
+        int numpfs = (N + 15) / 16;
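+        // (16 floats = 64 bytes = one cache line, so this is N rounded up to
+        // whole cache lines.)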
+
+        // Do initial prefetches
+        for(int i = 0; i < firstpfd + 1; i++)
+        {
+            prefetch_1x(firstpf_ptr);
+            firstpf_ptr += lda;
+        }
+
+        // Do "main" prefetches - adapt number to the number we actually need.
+        if(numpfs > 1)
+        {
+            for(int i = 0; i < pfd + 1; i++)
+            {
+                switch(numpfs)
+                {
+                    case 2:
+                        prefetch_1x(pf_ptr + 16);
+                        break;
+
+                    case 3:
+                        prefetch_2x(pf_ptr + 16);
+                        break;
+
+                    case 4:
+                        prefetch_3x(pf_ptr + 16);
+                        break;
+
+                    case 5:
+                        prefetch_4x(pf_ptr + 16);
+                        break;
+
+                    case 6:
+                        prefetch_5x(pf_ptr + 16);
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+                pf_ptr += lda;
+            }
+        }
+        else
+        {
+            // Just disable additional prefetches
+            dopf = 0;
+        }
+
+        // Do the real work
+        __asm __volatile(
+            // Initialize all the vectors - not worth skipping this if only
+            // some are needed.
+            "movi    v8.4s,#0x0\n"
+            "ldr    w0, [%[x_ptr]]\n"
+            "movi    v9.4s,#0x0\n"
+            "movi    v10.4s,#0x0\n"
+            "movi    v11.4s,#0x0\n"
+            "movi    v12.4s,#0x0\n"
+            "movi    v13.4s,#0x0\n"
+            "movi    v14.4s,#0x0\n"
+            "movi    v15.4s,#0x0\n"
+            "movi    v16.4s, #0x0\n"
+            "movi    v17.4s, #0x0\n"
+            "movi    v18.4s, #0x0\n"
+            "movi    v19.4s, #0x0\n"
+            "movi    v20.4s, #0x0\n"
+            "movi    v21.4s, #0x0\n"
+            "movi    v22.4s, #0x0\n"
+            "movi    v23.4s, #0x0\n"
+            "movi    v24.4s, #0x0\n"
+            "movi    v25.4s, #0x0\n"
+            "movi    v26.4s, #0x0\n"
+            "movi    v27.4s, #0x0\n"
+            "movi    v28.4s, #0x0\n"
+            "movi    v29.4s, #0x0\n"
+            "movi    v30.4s, #0x0\n"
+            "movi    v6.2s, #0x0\n"
+            "movi    v5.2s, #0x0\n"
+
+            "1:\n" ASM_PREFETCH("[%[firstpf_ptr]]\n")
+            "11:\n"
+            "dup    v0.4s, w0\n"
+            "ldr    w0, [%[x_ptr], #4]\n"
+            "add    %[x_ptr], %[x_ptr], #4\n"
+
+            "cbz    %w[numvecs], 2f\n"
+            "mov    %w[vecs], %w[numvecs]\n"
+
+            // Vector 0
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x00]\n"
+            "fmla    v8.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 1
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x10]\n"
+            "fmla    v9.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 2
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x20]\n"
+            "fmla    v10.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 3
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x30]\n"
+            "fmla    v11.4s, v7.4s, v0.4s\n"
+            // Prefetch
+            "cbz    %w[dopf], 3f\n" ASM_PREFETCH("[%[pf_ptr], #0x40]")
+            "3:\n"
+            "beq    2f\n"
+
+            // Vector 4
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x40]\n"
+            "fmla    v12.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 5
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x50]\n"
+            "fmla    v13.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 6
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x60]\n"
+            "fmla    v14.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 7
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x70]\n"
+            "fmla    v15.4s, v7.4s, v0.4s\n"
+            // Prefetch
+            "cbz    %w[dopf], 4f\n" ASM_PREFETCH("[%[pf_ptr], #0x80]")
+            "4:\n"
+            "beq    2f\n"
+
+            // Vector 8
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x80]\n"
+            "fmla    v16.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 9
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x90]\n"
+            "fmla    v17.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 10
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0xa0]\n"
+            "fmla    v18.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 11
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0xb0]\n"
+            "fmla    v19.4s, v7.4s, v0.4s\n"
+            // Prefetch
+            "cbz    %w[dopf], 5f\n" ASM_PREFETCH("[%[pf_ptr], #0xc0]")
+            "5:\n"
+            "beq    2f\n"
+
+            // Vector 12
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0xc0]\n"
+            "fmla    v20.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 13
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0xd0]\n"
+            "fmla    v21.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 14
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0xe0]\n"
+            "fmla    v22.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 15
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0xf0]\n"
+            "fmla    v23.4s, v7.4s, v0.4s\n"
+            // Prefetch
+            "cbz    %w[dopf], 6f\n" ASM_PREFETCH("[%[pf_ptr], #0x100]")
+            "6:\n"
+            "beq    2f\n"
+
+            // Vector 16
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x100]\n"
+            "fmla    v24.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 17
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x110]\n"
+            "fmla    v25.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 18
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x120]\n"
+            "fmla    v26.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 19
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x130]\n"
+            "fmla    v27.4s, v7.4s, v0.4s\n"
+            // Prefetch
+            "cbz    %w[dopf], 7f\n" ASM_PREFETCH("[%[pf_ptr], #0x140]")
+            "7:\n"
+            "beq    2f\n"
+
+            // Vector 20
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x140]\n"
+            "fmla    v28.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 21
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x150]\n"
+            "fmla    v29.4s, v7.4s, v0.4s\n"
+            "beq    2f\n"
+            // Vector 22
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7,[%[a_ptr], #0x160]\n"
+            "fmla    v30.4s, v7.4s, v0.4s\n"
+
+            "2:\n"
+            "add    %[a_ptr], %[a_ptr], %[jump]\n"
+
+            // Do the odd 2-vector, if needed
+            "cbz    %[odd2_aptr], 8f\n"
+            "ldr    d7, [%[odd2_aptr]]\n"
+            "fmla    v6.2s, v7.2s, v0.2s\n"
+            "add    %[odd2_aptr], %[odd2_aptr], %[jump]\n"
+
+            "8:\n"
+            // Do the odd 1-vector, if needed
+            "cbz    %[odd1_aptr], 9f\n"
+            "ldr    s7, [%[odd1_aptr]]\n"
+            "fmla    v5.2s, v7.2s, v0.2s\n"
+            "add    %[odd1_aptr], %[odd1_aptr], %[jump]\n"
+
+            // Get out if needed.
+            "9:\n"
+            "subs    %w[k], %w[k], #1\n"
+            "beq    10f\n"
+
+            // Update the "main" prefetch pointer, if it strays beyond the limit turn off "dopf"
+            "add    %[pf_ptr], %[pf_ptr], %[jump]\n"
+            "cmp    %[pf_ptr], %[pf_limit]\n"
+            "csel    %w[dopf], %w[dopf], WZR, LT\n"
+
+            // Update the "leading" prefetch pointer, don't do the first
+            // instruction of the loop if it's over the limit.
+            "add    %[firstpf_ptr], %[firstpf_ptr], %[jump]\n"
+            "cmp    %[firstpf_ptr], %[pf_limit]\n"
+            "blt    1b\n"
+            "b        11b\n"
+
+            // Now write out the outputs
+            "10:\n"
+            "cbz    %w[numvecs], 12f\n"
+            "mov    %w[vecs], %w[numvecs]\n"
+
+            // Vector 0
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v7.4s, v8.4s, %[va].4s\n"
+            "str    q7, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 1
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v7.4s, v9.4s, %[va].4s\n"
+            "str    q7, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 2
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v7.4s, v10.4s, %[va].4s\n"
+            "str    q7, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 3
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v7.4s, v11.4s, %[va].4s\n"
+            "str    q7, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 4
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v7.4s, v12.4s, %[va].4s\n"
+            "str    q7, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 5
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v7.4s, v13.4s, %[va].4s\n"
+            "str    q7, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 6
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v7.4s, v14.4s, %[va].4s\n"
+            "str    q7, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 7
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v7.4s, v15.4s, %[va].4s\n"
+            "str    q7, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 8
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v7.4s, v16.4s, %[va].4s\n"
+            "str    q7, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 9
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v7.4s, v17.4s, %[va].4s\n"
+            "str    q7, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 10
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v7.4s, v18.4s, %[va].4s\n"
+            "str    q7, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 11
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v7.4s, v19.4s, %[va].4s\n"
+            "str    q7, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 12
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v7.4s, v20.4s, %[va].4s\n"
+            "str    q7, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 13
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v7.4s, v21.4s, %[va].4s\n"
+            "str    q7, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 14
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v7.4s, v22.4s, %[va].4s\n"
+            "str    q7, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 15
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v7.4s, v23.4s, %[va].4s\n"
+            "str    q7, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 16
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v7.4s, v24.4s, %[va].4s\n"
+            "str    q7, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 17
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v7.4s, v25.4s, %[va].4s\n"
+            "str    q7, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 18
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v7.4s, v26.4s, %[va].4s\n"
+            "str    q7, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 19
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v7.4s, v27.4s, %[va].4s\n"
+            "str    q7, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 20
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v7.4s, v28.4s, %[va].4s\n"
+            "str    q7, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 21
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v7.4s, v29.4s, %[va].4s\n"
+            "str    q7, [%[y_ptr]], #0x10\n"
+            "beq    12f\n"
+            // Vector 22
+            "subs    %w[vecs], %w[vecs], #1\n"
+            "ldr    q7, [%[y_ptr]]\n"
+            "fmla    v7.4s, v30.4s, %[va].4s\n"
+            "str    q7, [%[y_ptr]], #0x10\n"
+
+            // Odd 2
+            "12:\n"
+            "cbz    %[odd2_aptr], 13f\n"
+            "ldr    d7, [%[y_ptr]]\n"
+            "fmla    v7.2s, v6.2s, %[va].2s\n"
+            "str    d7, [%[y_ptr]], #0x8\n"
+
+            // Odd 1
+            "13:\n"
+            "cbz    %[odd1_aptr], 14f\n"
+            "ldr    s7, [%[y_ptr]]\n"
+            "fmla    v7.2s, v5.2s, %[va].2s\n"
+            "str    s7, [%[y_ptr]]\n"
+
+            "14:\n"
+            : [a_ptr] "+r"(a_ptr), [x_ptr] "+r"(x_ptr), [y_ptr] "+r"(y_ptr), [k] "+r"(k),
+            [pf_ptr] "+r"(pf_ptr), [firstpf_ptr] "+r"(firstpf_ptr),
+            [odd1_aptr] "+r"(odd1_aptr), [odd2_aptr] "+r"(odd2_aptr),
+            [dopf] "+r"(dopf), [vecs] "+r"(vecs)
+            : [jump] "r"(jump), [va] "w"(va), [pf_limit] "r"(pf_limit), [numvecs] "r"(numvecs)
+            : "w0", "v0", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
+            "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
+            "v27", "v28", "v29", "v30", "v31", "cc");
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/mergeresults.hpp b/src/core/NEON/kernels/arm_gemm/mergeresults.hpp
new file mode 100644
index 0000000..2ab01d6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/mergeresults.hpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+/* The merge implementations are included inside the arm_gemm namespace, but
+ * some of them need these headers, so include the headers here (outside the
+ * namespace).  */
+#include <arm_neon.h>
+
+#include "asmlib.hpp"
+#include "utils.hpp"
+
+namespace arm_gemm
+{
+template <unsigned int width, unsigned int height, typename Tin, typename Tout>
+inline void MergeResults(Tout *out, const Tin *in, int ldc, int y0, int ymax, int x0, int xmax, const Tout alpha, const Tout beta)
+{
+    int full_y_blocks = (ymax - y0) / height;
+    int y_remainder   = (ymax - y0) % height;
+    int y_blocks      = full_y_blocks + (y_remainder ? 1 : 0);
+
+    int full_x_blocks = (xmax - x0) / width;
+    int x_remainder   = (xmax - x0) % width;
+    int x_blocks      = full_x_blocks + (x_remainder ? 1 : 0);
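+
+    /* The input buffer is expected to hold the results as a sequence of
+     * width x height blocks: every block of the first block-row, then the
+     * next block-row, and so on - hence "in" simply advances by
+     * width * height after each block. */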
+
+    for(int y_block = 0; y_block < y_blocks; y_block++)
+    {
+        int ybase = y0 + (y_block * height);
+
+        int fill_rows = (y_block < full_y_blocks) ? height : y_remainder;
+
+        for(int x_block = 0; x_block < x_blocks; x_block++)
+        {
+            int xbase = x0 + (x_block * width);
+
+            int fill_cols = (x_block < full_x_blocks) ? width : x_remainder;
+
+            for(int row = 0; row < fill_rows; row++)
+            {
+                for(int col = 0; col < fill_cols; col++)
+                {
+                    Tout &p = out[(ybase + row) * ldc + xbase + col];
+
+                    p = (p * beta) + (alpha * in[row * width + col]);
+                }
+            }
+
+            in += (width * height);
+        }
+    }
+}
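+
+/* Illustrative use only (the names below are not part of this header):
+ * merging one 12x8 block of accumulator output "buf" into rows 0-7,
+ * columns 0-11 of a matrix C with leading dimension ldc would be
+ *
+ *     MergeResults<12, 8>(C, buf, ldc, 0, 8, 0, 12, alpha, beta);
+ */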
+
+#include "merges/list.hpp"
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp b/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp
new file mode 100644
index 0000000..b44e564
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __arm__
+
+#include <arm_neon.h>
+
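+// Merge routine for the AArch32 8x6 kernel: each 8-wide x 6-high block of
+// accumulator output is combined into the output matrix as
+// out = (out * beta) + (in * alpha), with scalar fallback paths for the
+// ragged right-hand and bottom edges.
+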
+template <>
+inline void MergeResults<8, 6>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta)
+{
+    const float *inptr = in;
+    prefetch_6x(inptr);
+    prefetch_6x(inptr + 96);
+
+    float32x4_t av = vdupq_n_f32(alpha);
+    float32x4_t bv = vdupq_n_f32(beta);
+
+    for(int y = y0; y < ymax; y += 6)
+    {
+        float *outptr0 = out + (y * ldout) + x0;
+        float *outptr1 = outptr0 + ldout;
+        float *outptr2 = outptr1 + ldout;
+        float *outptr3 = outptr2 + ldout;
+        float *outptr4 = outptr3 + ldout;
+        float *outptr5 = outptr4 + ldout;
+
+        prefetch_2x(outptr0);
+        prefetch_2x(outptr1);
+        prefetch_2x(outptr2);
+        prefetch_2x(outptr3);
+        prefetch_2x(outptr4);
+        prefetch_2x(outptr5);
+
+        for(int i = x0; i < xmax; i += 8)
+        {
+            float dummyres[8];
+
+            /* Make sure we throw away results if Y isn't a multiple of 6.
+             * We do this by pointing the result pointer at a dummy buffer
+             * we later discard.  */
+            if((y + 5) >= ymax)
+            {
+                switch((y + 5) - ymax)
+                {
+                    case 4:
+                        outptr1 = dummyres;
+                    case 3:
+                        outptr2 = dummyres;
+                    case 2:
+                        outptr3 = dummyres;
+                    case 1:
+                        outptr4 = dummyres;
+                    case 0:
+                        outptr5 = dummyres;
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+            }
+
+            /* For ragged X, manually copy over the valid results. */
+            if((i + 7) >= xmax)
+            {
+                for(int xi = 0; xi < 8; xi++)
+                {
+                    if((i + xi) < xmax)
+                    {
+                        *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+                        outptr0++;
+                        *outptr1 = (alpha * inptr[xi + 8]) + (*outptr1 * beta);
+                        outptr1++;
+                        *outptr2 = (alpha * inptr[xi + 16]) + (*outptr2 * beta);
+                        outptr2++;
+                        *outptr3 = (alpha * inptr[xi + 24]) + (*outptr3 * beta);
+                        outptr3++;
+                        *outptr4 = (alpha * inptr[xi + 32]) + (*outptr4 * beta);
+                        outptr4++;
+                        *outptr5 = (alpha * inptr[xi + 40]) + (*outptr5 * beta);
+                        outptr5++;
+                    }
+                }
+                inptr += 48;
+            }
+            else
+            {
+                /* Optimized routine to copy an entire block */
+                __asm __volatile(
+                    // Rows 0-1
+                    "VLD1.32    {d8-d11},  [%[outptr0]]\n"
+                    "VMUL.f32    q4, q4, %q[bv]\n"
+                    "VLD1.32    {d12-d15}, [%[outptr1]]\n"
+                    "VMUL.f32    q5, q5, %q[bv]\n"
+                    "VLD1.32    {d0-d3},   [%[inptr]]!\n"
+                    "VMUL.f32    q6, q6, %q[bv]\n"
+                    "VLD1.32    {d4-d7},   [%[inptr]]!\n"
+                    "VMUL.f32    q7, q7, %q[bv]\n"
+
+                    "VMLA.f32    q4, q0, %q[av]\n" ASM_PREFETCH("[%[inptr], #352]")
+                    "VMLA.f32    q5, q1, %q[av]\n"
+                    "VST1.32    {d8-d11}, [%[outptr0]]!\n" ASM_PREFETCH("[%[inptr], #416]") "VMLA.f32    q6, q2, %q[av]\n" ASM_PREFETCH("[%[inptr], #480]")
+                    "VMLA.f32    q7, q3, %q[av]\n"
+                    "VST1.32    {d12-d15}, [%[outptr1]]!\n"
+
+                    // Rows 2-3
+                    "VLD1.32    {d8-d11},  [%[outptr2]]\n"
+                    "VMUL.f32    q4, q4, %q[bv]\n"
+                    "VLD1.32    {d12-d15}, [%[outptr3]]\n"
+                    "VMUL.f32    q5, q5, %q[bv]\n"
+                    "VLD1.32    {d0-d3},   [%[inptr]]!\n"
+                    "VMUL.f32    q6, q6, %q[bv]\n"
+                    "VLD1.32    {d4-d7},   [%[inptr]]!\n"
+                    "VMUL.f32    q7, q7, %q[bv]\n"
+
+                    "VMLA.f32    q4, q0, %q[av]\n" ASM_PREFETCH("[%[outptr0], #96]")
+                    "VMLA.f32    q5, q1, %q[av]\n"
+                    "VST1.32    {d8-d11}, [%[outptr2]]!\n" ASM_PREFETCH("[%[outptr1], #96]") "VMLA.f32    q6, q2, %q[av]\n" ASM_PREFETCH("[%[outptr2], #96]")
+                    "VMLA.f32    q7, q3, %q[av]\n"
+                    "VST1.32    {d12-d15}, [%[outptr3]]!\n"
+
+                    // Rows 4-5
+                    "VLD1.32    {d8-d11},  [%[outptr4]]\n"
+                    "VMUL.f32    q4, q4, %q[bv]\n"
+                    "VLD1.32    {d12-d15}, [%[outptr5]]\n"
+                    "VMUL.f32    q5, q5, %q[bv]\n"
+                    "VLD1.32    {d0-d3},   [%[inptr]]!\n"
+                    "VMUL.f32    q6, q6, %q[bv]\n"
+                    "VLD1.32    {d4-d7},   [%[inptr]]!\n"
+                    "VMUL.f32    q7, q7, %q[bv]\n"
+
+                    "VMLA.f32    q4, q0, %q[av]\n" ASM_PREFETCH("[%[outptr3], #96]")
+                    "VMLA.f32    q5, q1, %q[av]\n"
+                    "VST1.32    {d8-d11}, [%[outptr4]]!\n" ASM_PREFETCH("[%[outptr4], #96]")
+                    "VMLA.f32    q6, q2, %q[av]\n" ASM_PREFETCH("[%[outptr5], #128]")
+                    "VMLA.f32    q7, q3, %q[av]\n"
+                    "VST1.32    {d12-d15}, [%[outptr5]]!\n"
+                    : [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3),
+                    [outptr4] "+r"(outptr4), [outptr5] "+r"(outptr5), [inptr] "+r"(inptr)
+                    : [av] "w"(av), [bv] "w"(bv)
+                    : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
+            }
+        }
+    }
+}
+
+#endif // __arm__
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_12x8.hpp
new file mode 100644
index 0000000..3b59a43
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_12x8.hpp
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+template <>
+inline void MergeResults<12, 8>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta)
+{
+    const float *inptr = in;
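+    /* Prefetch the first two 12x8 blocks of incoming results (96 floats each). */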
+    prefetch_6x(inptr);
+    prefetch_6x(inptr + 96);
+
+    float32x4_t av = vdupq_n_f32(alpha);
+    float32x4_t bv = vdupq_n_f32(beta);
+
+    for(int y = y0; y < ymax; y += 8)
+    {
+        float *outptr0 = out + (y * ldout) + x0;
+        float *outptr1 = outptr0 + ldout;
+        float *outptr2 = outptr1 + ldout;
+        float *outptr3 = outptr2 + ldout;
+        float *outptr4 = outptr3 + ldout;
+        float *outptr5 = outptr4 + ldout;
+        float *outptr6 = outptr5 + ldout;
+        float *outptr7 = outptr6 + ldout;
+
+        prefetch_2x(outptr0);
+        prefetch_2x(outptr1);
+        prefetch_2x(outptr2);
+        prefetch_2x(outptr3);
+        prefetch_2x(outptr4);
+        prefetch_2x(outptr5);
+        prefetch_2x(outptr6);
+        prefetch_2x(outptr7);
+
+        for(int i = x0; i < xmax; i += 12)
+        {
+            float dummyres[12];
+
+            /* Make sure we throw away results if Y isn't a multiple of 8.
+             * We do this by pointing the result pointer at a dummy buffer
+             * we later discard.  */
+            if((y + 7) >= ymax)
+            {
+                switch((y + 7) - ymax)
+                {
+                    case 6:
+                        outptr1 = dummyres;
+                    case 5:
+                        outptr2 = dummyres;
+                    case 4:
+                        outptr3 = dummyres;
+                    case 3:
+                        outptr4 = dummyres;
+                    case 2:
+                        outptr5 = dummyres;
+                    case 1:
+                        outptr6 = dummyres;
+                    case 0:
+                        outptr7 = dummyres;
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+            }
+
+            /* For ragged X, manually copy over the valid results. */
+            if((i + 11) >= xmax)
+            {
+                for(int xi = 0; xi < 12; xi++)
+                {
+                    if((i + xi) < xmax)
+                    {
+                        *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+                        outptr0++;
+                        *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+                        outptr1++;
+                        *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+                        outptr2++;
+                        *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+                        outptr3++;
+                        *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
+                        outptr4++;
+                        *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
+                        outptr5++;
+                        *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta);
+                        outptr6++;
+                        *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta);
+                        outptr7++;
+                    }
+                }
+                inptr += 96;
+            }
+            else
+            {
+                /* Optimized routine to merge an entire block */
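+                /* Each pass handles two rows of 12 floats: the existing output is
+                 * loaded into v16-v21 and scaled by beta, while the incoming results
+                 * stream through v0-v5 and are accumulated with alpha via FMLA. */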
+                __asm __volatile(
+                    // Rows 0-1
+                    "LDP    q16, q17, [%[outptr0]]\n"
+                    "FMUL    v16.4s, v16.4s, %[bv].4s\n"
+                    "LDR    q18, [%[outptr0], #32]\n"
+                    "FMUL    v17.4s, v17.4s, %[bv].4s\n"
+                    "LDP    q19, q20, [%[outptr1]]\n"
+                    "FMUL    v18.4s, v18.4s, %[bv].4s\n"
+                    "LDR    q21, [%[outptr1], #32]\n" ASM_PREFETCH("[%[inptr], #768]")
+                    "FMUL    v19.4s, v19.4s, %[bv].4s\n"
+                    "LDP    q0,  q1,  [%[inptr]]\n"
+                    "FMUL    v20.4s, v20.4s, %[bv].4s\n"
+                    "LDP    q2,  q3,  [%[inptr], #32]\n"
+                    "FMUL    v21.4s, v21.4s, %[bv].4s\n"
+                    "LDP    q4,  q5,  [%[inptr], #64]\n"
+                    "FMLA    v16.4s, v0.4s, %[av].4s\n" ASM_PREFETCH("[%[inptr], #832]")
+                    "FMLA    v17.4s, v1.4s, %[av].4s\n"
+                    "STP    q16, q17, [%[outptr0]], #32\n"
+                    "FMLA    v18.4s, v2.4s, %[av].4s\n"
+                    "STR    q18, [%[outptr0]], #16\n"
+                    "FMLA    v19.4s, v3.4s, %[av].4s\n" ASM_PREFETCH("[%[inptr], #896]")
+                    "FMLA    v20.4s, v4.4s, %[av].4s\n"
+                    "STP    q19, q20, [%[outptr1]], #32\n"
+                    "FMLA    v21.4s, v5.4s, %[av].4s\n"
+                    "STR    q21, [%[outptr1]], #16\n"
+
+                    // Rows 2-3
+                    "LDP    q16, q17, [%[outptr2]]\n"
+                    "FMUL    v16.4s, v16.4s, %[bv].4s\n"
+                    "LDR    q18, [%[outptr2], #32]\n"
+                    "FMUL    v17.4s, v17.4s, %[bv].4s\n"
+                    "LDP    q19, q20, [%[outptr3]]\n"
+                    "FMUL    v18.4s, v18.4s, %[bv].4s\n"
+                    "LDR    q21, [%[outptr3], #32]\n" ASM_PREFETCH("[%[inptr], #960]")
+                    "FMUL    v19.4s, v19.4s, %[bv].4s\n"
+                    "LDP    q0,  q1,  [%[inptr], #96]\n"
+                    "FMUL    v20.4s, v20.4s, %[bv].4s\n"
+                    "LDP    q2,  q3,  [%[inptr], #128]\n"
+                    "FMUL    v21.4s, v21.4s, %[bv].4s\n"
+                    "LDP    q4,  q5,  [%[inptr], #160]\n"
+                    "FMLA    v16.4s, v0.4s, %[av].4s\n" ASM_PREFETCH("[%[inptr], #1024]")
+                    "FMLA    v17.4s, v1.4s, %[av].4s\n"
+                    "STP    q16, q17, [%[outptr2]], #32\n"
+                    "FMLA    v18.4s, v2.4s, %[av].4s\n"
+                    "STR    q18, [%[outptr2]], #16\n"
+                    "FMLA    v19.4s, v3.4s, %[av].4s\n" ASM_PREFETCH("[%[inptr], #1088]")
+                    "FMLA    v20.4s, v4.4s, %[av].4s\n"
+                    "STP    q19, q20, [%[outptr3]], #32\n"
+                    "FMLA    v21.4s, v5.4s, %[av].4s\n"
+                    "STR    q21, [%[outptr3]], #16\n"
+
+                    // Rows 4-5
+                    ASM_PREFETCH("[%[outptr0], #80]")
+                    "LDP    q16, q17, [%[outptr4]]\n"
+                    "FMUL    v16.4s, v16.4s, %[bv].4s\n"
+                    "LDR    q18, [%[outptr4], #32]\n"
+                    "FMUL    v17.4s, v17.4s, %[bv].4s\n"
+                    "LDP    q19, q20, [%[outptr5]]\n"
+                    "FMUL    v18.4s, v18.4s, %[bv].4s\n"
+                    "LDR    q21, [%[outptr5], #32]\n" ASM_PREFETCH("[%[outptr1], #80]")
+                    "FMUL    v19.4s, v19.4s, %[bv].4s\n"
+                    "LDP    q0,  q1,  [%[inptr], #192]\n"
+                    "FMUL    v20.4s, v20.4s, %[bv].4s\n"
+                    "LDP    q2,  q3,  [%[inptr], #224]\n"
+                    "FMUL    v21.4s, v21.4s, %[bv].4s\n"
+                    "LDP    q4,  q5,  [%[inptr], #256]\n"
+                    "FMLA    v16.4s, v0.4s, %[av].4s\n" ASM_PREFETCH("[%[outptr2], #80]")
+                    "FMLA    v17.4s, v1.4s, %[av].4s\n"
+                    "STP    q16, q17, [%[outptr4]], #32\n"
+                    "FMLA    v18.4s, v2.4s, %[av].4s\n"
+                    "STR    q18, [%[outptr4]], #16\n"
+                    "FMLA    v19.4s, v3.4s, %[av].4s\n" ASM_PREFETCH("[%[outptr3], #80]")
+                    "FMLA    v20.4s, v4.4s, %[av].4s\n"
+                    "STP    q19, q20, [%[outptr5]], #32\n"
+                    "FMLA    v21.4s, v5.4s, %[av].4s\n"
+                    "STR    q21, [%[outptr5]], #16\n"
+
+                    // Rows 6-7
+                    ASM_PREFETCH("[%[outptr4], #80]")
+                    "LDP    q16, q17, [%[outptr6]]\n"
+                    "FMUL    v16.4s, v16.4s, %[bv].4s\n"
+                    "LDR    q18, [%[outptr6], #32]\n"
+                    "FMUL    v17.4s, v17.4s, %[bv].4s\n"
+                    "LDP    q19, q20, [%[outptr7]]\n"
+                    "FMUL    v18.4s, v18.4s, %[bv].4s\n"
+                    "LDR    q21, [%[outptr7], #32]\n" ASM_PREFETCH("[%[outptr5], #80]")
+                    "FMUL    v19.4s, v19.4s, %[bv].4s\n"
+                    "LDP    q0,  q1,  [%[inptr], #288]\n"
+                    "FMUL    v20.4s, v20.4s, %[bv].4s\n"
+                    "LDP    q2,  q3,  [%[inptr], #320]\n"
+                    "FMUL    v21.4s, v21.4s, %[bv].4s\n"
+                    "LDP    q4,  q5,  [%[inptr], #352]\n"
+                    "FMLA    v16.4s, v0.4s, %[av].4s\n" ASM_PREFETCH("[%[outptr6], #128]")
+                    "FMLA    v17.4s, v1.4s, %[av].4s\n"
+                    "STP    q16, q17, [%[outptr6]], #32\n"
+                    "FMLA    v18.4s, v2.4s, %[av].4s\n"
+                    "STR    q18, [%[outptr6]], #16\n"
+                    "FMLA    v19.4s, v3.4s, %[av].4s\n" ASM_PREFETCH("[%[outptr7], #128]")
+                    "FMLA    v20.4s, v4.4s, %[av].4s\n"
+                    "STP    q19, q20, [%[outptr7]], #32\n"
+                    "FMLA    v21.4s, v5.4s, %[av].4s\n"
+                    "STR    q21, [%[outptr7]], #16\n"
+                    "ADD    %[inptr], %[inptr], #384\n"
+                    : [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3),
+                    [outptr4] "+r"(outptr4), [outptr5] "+r"(outptr5), [outptr6] "+r"(outptr6), [outptr7] "+r"(outptr7),
+                    [inptr] "+r"(inptr)
+                    : [av] "w"(av), [bv] "w"(bv)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21");
+            }
+        }
+    }
+}
+
+#endif // __aarch64__
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_to_half_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_to_half_12x8.hpp
new file mode 100644
index 0000000..12a0901
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_to_half_12x8.hpp
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
+
+#include <arm_neon.h>
+
+template <>
+inline void MergeResults<12, 8>(__fp16 *out, const float *in, int ldout, int y0, int ymax, int x0, int xmax, const __fp16 alpha, const __fp16 beta)
+{
+    const float *inptr = in;
+    prefetch_6x(inptr);
+    prefetch_6x(inptr + 24);
+
+    float32x4_t av = vdupq_n_f32(alpha);
+    float32x4_t bv = vdupq_n_f32(beta);
+
+    for(int y = y0; y < ymax; y += 8)
+    {
+        __fp16 *outptr0 = out + (y * ldout) + x0;
+        __fp16 *outptr1 = outptr0 + ldout;
+        __fp16 *outptr2 = outptr1 + ldout;
+        __fp16 *outptr3 = outptr2 + ldout;
+        __fp16 *outptr4 = outptr3 + ldout;
+        __fp16 *outptr5 = outptr4 + ldout;
+        __fp16 *outptr6 = outptr5 + ldout;
+        __fp16 *outptr7 = outptr6 + ldout;
+
+        prefetch_2x(outptr0);
+        prefetch_2x(outptr1);
+        prefetch_2x(outptr2);
+        prefetch_2x(outptr3);
+        prefetch_2x(outptr4);
+        prefetch_2x(outptr5);
+        prefetch_2x(outptr6);
+        prefetch_2x(outptr7);
+
+        for(int i = x0; i < xmax; i += 12)
+        {
+            __fp16 dummyres[12];
+
+            /* Make sure we throw away results if Y isn't a multiple of 8.
+             * We do this by pointing the result pointer at a dummy buffer
+             * we later discard.  */
+            if((y + 7) >= ymax)
+            {
+                switch((y + 7) - ymax)
+                {
+                    case 6:
+                        outptr1 = dummyres;
+                    case 5:
+                        outptr2 = dummyres;
+                    case 4:
+                        outptr3 = dummyres;
+                    case 3:
+                        outptr4 = dummyres;
+                    case 2:
+                        outptr5 = dummyres;
+                    case 1:
+                        outptr6 = dummyres;
+                    case 0:
+                        outptr7 = dummyres;
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+            }
+
+            /* For ragged X, manually copy over the valid results. */
+            if((i + 11) >= xmax)
+            {
+                for(int xi = 0; xi < 12; xi++)
+                {
+                    if((i + xi) < xmax)
+                    {
+                        *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+                        outptr0++;
+                        *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+                        outptr1++;
+                        *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+                        outptr2++;
+                        *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+                        outptr3++;
+                        *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
+                        outptr4++;
+                        *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
+                        outptr5++;
+                        *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta);
+                        outptr6++;
+                        *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta);
+                        outptr7++;
+                    }
+                }
+                inptr += 96;
+            }
+            else
+            {
+                /* Optimized routine to merge an entire block */
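+                /* The existing fp16 output is widened to fp32 with FCVTL/FCVTL2,
+                 * merged in fp32 (beta * out + alpha * in), then narrowed back to
+                 * fp16 with FCVTN/FCVTN2 before being stored. */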
+                __asm __volatile(
+                    // Rows 0-1
+                    "LDR    q16, [%[outptr0]]\n"
+                    "FCVTL2    v17.4s, v16.8h\n"
+                    "LDR    d18, [%[outptr0], #16]\n"
+                    "FCVTL    v16.4s, v16.4h\n"
+                    "LDR    q19, [%[outptr1]]\n"
+                    "FMUL    v17.4s, v17.4s, %[bv].4s\n"
+                    "LDR    d21, [%[outptr1], #16]\n"
+                    "FMUL    v16.4s, v16.4s, %[bv].4s\n"
+                    "LDP    q0,  q1,  [%[inptr]]\n"
+                    "FCVTL    v18.4s, v18.4h\n"
+                    "LDP    q2,  q3,  [%[inptr], #32]\n"
+                    "FCVTL2    v20.4s, v19.8h\n"
+                    "LDP    q4,  q5,  [%[inptr], #64]\n"
+                    "FCVTL    v19.4s, v19.4h\n" ASM_PREFETCH("[%[inptr], #768]")
+                    "FCVTL    v21.4s, v21.4h\n" ASM_PREFETCH("[%[inptr], #832]")
+                    "FMUL    v18.4s, v18.4s, %[bv].4s\n" ASM_PREFETCH("[%[inptr], #896]")
+                    "FMUL    v20.4s, v20.4s, %[bv].4s\n" ASM_PREFETCH("[%[inptr], #960]")
+                    "FMUL    v19.4s, v19.4s, %[bv].4s\n"
+                    "FMUL    v21.4s, v21.4s, %[bv].4s\n"
+                    "FMLA    v16.4s, v0.4s, %[av].4s\n"
+                    "FMLA    v17.4s, v1.4s, %[av].4s\n"
+                    "FCVTN    v16.4h, v16.4s\n"
+                    "FCVTN2    v16.8h, v17.4s\n"
+                    "FMLA    v18.4s, v2.4s, %[av].4s\n"
+                    "STR    q16, [%[outptr0]], #16\n"
+                    "FCVTN    v18.4h, v18.4s\n"
+                    "STR    d18, [%[outptr0]], #8\n"
+                    "FMLA    v19.4s, v3.4s, %[av].4s\n"
+                    "FMLA    v20.4s, v4.4s, %[av].4s\n"
+                    "FCVTN    v19.4h, v19.4s\n"
+                    "FCVTN2    v19.8h, v20.4s\n"
+                    "STR    q19, [%[outptr1]], #16\n"
+                    "FMLA    v21.4s, v5.4s, %[av].4s\n"
+                    "FCVTN    v21.4h, v21.4s\n"
+                    "STR    d21, [%[outptr1]], #8\n"
+
+                    // Rows 2-3
+                    "LDR    q16, [%[outptr2]]\n"
+                    "FCVTL2    v17.4s, v16.8h\n"
+                    "LDR    d18, [%[outptr2], #16]\n"
+                    "FCVTL    v16.4s, v16.4h\n"
+                    "LDR    q19, [%[outptr3]]\n"
+                    "FMUL    v17.4s, v17.4s, %[bv].4s\n"
+                    "LDR    d21, [%[outptr3], #16]\n"
+                    "FMUL    v16.4s, v16.4s, %[bv].4s\n"
+                    "LDP    q0,  q1,  [%[inptr], #96]\n"
+                    "FCVTL    v18.4s, v18.4h\n"
+                    "LDP    q2,  q3,  [%[inptr], #128]\n"
+                    "FCVTL2    v20.4s, v19.8h\n"
+                    "LDP    q4,  q5,  [%[inptr], #160]\n"
+                    "FCVTL    v19.4s, v19.4h\n" ASM_PREFETCH("[%[inptr], #1024]")
+                    "FCVTL    v21.4s, v21.4h\n" ASM_PREFETCH("[%[inptr], #1088]")
+                    "FMUL    v18.4s, v18.4s, %[bv].4s\n" ASM_PREFETCH("[%[outptr0], #64]")
+                    "FMUL    v20.4s, v20.4s, %[bv].4s\n" ASM_PREFETCH("[%[outptr1], #64]")
+                    "FMUL    v19.4s, v19.4s, %[bv].4s\n"
+                    "FMUL    v21.4s, v21.4s, %[bv].4s\n"
+                    "FMLA    v16.4s, v0.4s, %[av].4s\n"
+                    "FMLA    v17.4s, v1.4s, %[av].4s\n"
+                    "FCVTN    v16.4h, v16.4s\n"
+                    "FCVTN2    v16.8h, v17.4s\n"
+                    "FMLA    v18.4s, v2.4s, %[av].4s\n"
+                    "STR    q16, [%[outptr2]], #16\n"
+                    "FCVTN    v18.4h, v18.4s\n"
+                    "STR    d18, [%[outptr2]], #8\n"
+                    "FMLA    v19.4s, v3.4s, %[av].4s\n"
+                    "FMLA    v20.4s, v4.4s, %[av].4s\n"
+                    "FCVTN    v19.4h, v19.4s\n"
+                    "FCVTN2    v19.8h, v20.4s\n"
+                    "STR    q19, [%[outptr3]], #16\n"
+                    "FMLA    v21.4s, v5.4s, %[av].4s\n"
+                    "FCVTN    v21.4h, v21.4s\n"
+                    "STR    d21, [%[outptr3]], #8\n"
+
+                    // Rows 4-5
+                    "LDR    q16, [%[outptr4]]\n"
+                    "FCVTL2    v17.4s, v16.8h\n"
+                    "LDR    d18, [%[outptr4], #16]\n"
+                    "FCVTL    v16.4s, v16.4h\n"
+                    "LDR    q19, [%[outptr5]]\n"
+                    "FMUL    v17.4s, v17.4s, %[bv].4s\n"
+                    "LDR    d21, [%[outptr5], #16]\n"
+                    "FMUL    v16.4s, v16.4s, %[bv].4s\n"
+                    "LDP    q0,  q1,  [%[inptr], #192]\n"
+                    "FCVTL    v18.4s, v18.4h\n"
+                    "LDP    q2,  q3,  [%[inptr], #224]\n"
+                    "FCVTL2    v20.4s, v19.8h\n"
+                    "LDP    q4,  q5,  [%[inptr], #256]\n"
+                    "FCVTL    v19.4s, v19.4h\n" ASM_PREFETCH("[%[outptr2], #64]")
+                    "FCVTL    v21.4s, v21.4h\n" ASM_PREFETCH("[%[outptr3], #64]")
+                    "FMUL    v18.4s, v18.4s, %[bv].4s\n" ASM_PREFETCH("[%[outptr4], #88]")
+                    "FMUL    v20.4s, v20.4s, %[bv].4s\n"
+                    "FMUL    v19.4s, v19.4s, %[bv].4s\n"
+                    "FMUL    v21.4s, v21.4s, %[bv].4s\n"
+                    "FMLA    v16.4s, v0.4s, %[av].4s\n"
+                    "FMLA    v17.4s, v1.4s, %[av].4s\n"
+                    "FCVTN    v16.4h, v16.4s\n"
+                    "FCVTN2    v16.8h, v17.4s\n"
+                    "FMLA    v18.4s, v2.4s, %[av].4s\n"
+                    "STR    q16, [%[outptr4]], #16\n"
+                    "FCVTN    v18.4h, v18.4s\n"
+                    "STR    d18, [%[outptr4]], #8\n"
+                    "FMLA    v19.4s, v3.4s, %[av].4s\n"
+                    "FMLA    v20.4s, v4.4s, %[av].4s\n"
+                    "FCVTN    v19.4h, v19.4s\n"
+                    "FCVTN2    v19.8h, v20.4s\n"
+                    "STR    q19, [%[outptr5]], #16\n"
+                    "FMLA    v21.4s, v5.4s, %[av].4s\n"
+                    "FCVTN    v21.4h, v21.4s\n"
+                    "STR    d21, [%[outptr5]], #8\n"
+
+                    // Rows 6-7
+                    "LDR    q16, [%[outptr6]]\n"
+                    "FCVTL2    v17.4s, v16.8h\n"
+                    "LDR    d18, [%[outptr6], #16]\n"
+                    "FCVTL    v16.4s, v16.4h\n"
+                    "LDR    q19, [%[outptr7]]\n"
+                    "FMUL    v17.4s, v17.4s, %[bv].4s\n"
+                    "LDR    d21, [%[outptr7], #16]\n"
+                    "FMUL    v16.4s, v16.4s, %[bv].4s\n"
+                    "LDP    q0,  q1,  [%[inptr], #288]\n"
+                    "FCVTL    v18.4s, v18.4h\n"
+                    "LDP    q2,  q3,  [%[inptr], #320]\n"
+                    "FCVTL2    v20.4s, v19.8h\n"
+                    "LDP    q4,  q5,  [%[inptr], #352]\n"
+                    "FCVTL    v19.4s, v19.4h\n" ASM_PREFETCH("[%[outptr5], #64]")
+                    "FCVTL    v21.4s, v21.4h\n" ASM_PREFETCH("[%[outptr6], #88]")
+                    "FMUL    v18.4s, v18.4s, %[bv].4s\n" ASM_PREFETCH("[%[outptr7], #88]")
+                    "FMUL    v20.4s, v20.4s, %[bv].4s\n"
+                    "FMUL    v19.4s, v19.4s, %[bv].4s\n"
+                    "FMUL    v21.4s, v21.4s, %[bv].4s\n"
+                    "FMLA    v16.4s, v0.4s, %[av].4s\n"
+                    "FMLA    v17.4s, v1.4s, %[av].4s\n"
+                    "FCVTN    v16.4h, v16.4s\n"
+                    "FCVTN2    v16.8h, v17.4s\n"
+                    "FMLA    v18.4s, v2.4s, %[av].4s\n"
+                    "STR    q16, [%[outptr6]], #16\n"
+                    "FCVTN    v18.4h, v18.4s\n"
+                    "STR    d18, [%[outptr6]], #8\n"
+                    "FMLA    v19.4s, v3.4s, %[av].4s\n"
+                    "FMLA    v20.4s, v4.4s, %[av].4s\n"
+                    "FCVTN    v19.4h, v19.4s\n"
+                    "FCVTN2    v19.8h, v20.4s\n"
+                    "STR    q19, [%[outptr7]], #16\n"
+                    "FMLA    v21.4s, v5.4s, %[av].4s\n"
+                    "FCVTN    v21.4h, v21.4s\n"
+                    "STR    d21, [%[outptr7]], #8\n"
+                    "ADD    %[inptr], %[inptr], #384\n"
+                    : [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3),
+                    [outptr4] "+r"(outptr4), [outptr5] "+r"(outptr5), [outptr6] "+r"(outptr6), [outptr7] "+r"(outptr7),
+                    [inptr] "+r"(inptr)
+                    : [av] "w"(av), [bv] "w"(bv)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21");
+            }
+        }
+    }
+}
+
+#endif // __aarch64__ && __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_half_24x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_half_24x8.hpp
new file mode 100644
index 0000000..08cfc00
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_half_24x8.hpp
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
+
+template <>
+inline void MergeResults<24, 8>(__fp16 *out, const __fp16 *in, const int ldout, const int y0, const int ymax,
+                                const int x0, const int xmax, const __fp16 alpha, const __fp16 beta)
+{
+    const __fp16 *inptr = in;
+    prefetch_6x(inptr);
+    prefetch_6x(inptr + 48);
+
+    float16x8_t va = vdupq_n_f16(alpha);
+    float16x8_t vb = vdupq_n_f16(beta);
+
+    for(int y = y0; y < ymax; y += 8)
+    {
+        __fp16 *outptr0 = out + (y * ldout) + x0;
+        __fp16 *outptr1 = outptr0 + ldout;
+        __fp16 *outptr2 = outptr1 + ldout;
+        __fp16 *outptr3 = outptr2 + ldout;
+        __fp16 *outptr4 = outptr3 + ldout;
+        __fp16 *outptr5 = outptr4 + ldout;
+        __fp16 *outptr6 = outptr5 + ldout;
+        __fp16 *outptr7 = outptr6 + ldout;
+
+        prefetch_2x(outptr0);
+        prefetch_2x(outptr1);
+        prefetch_2x(outptr2);
+        prefetch_2x(outptr3);
+        prefetch_2x(outptr4);
+        prefetch_2x(outptr5);
+        prefetch_2x(outptr6);
+        prefetch_2x(outptr7);
+
+        for(int i = x0; i < xmax; i += 24)
+        {
+            __fp16 dummyres[24];
+
+            /* Make sure we throw away results if Y isn't a multiple of 8.
+             * We do this by pointing the result pointer at a dummy buffer
+             * we later discard.  */
+            if((y + 7) >= ymax)
+            {
+                switch((y + 7) - ymax)
+                {
+                    case 6:
+                        outptr1 = dummyres;
+                    case 5:
+                        outptr2 = dummyres;
+                    case 4:
+                        outptr3 = dummyres;
+                    case 3:
+                        outptr4 = dummyres;
+                    case 2:
+                        outptr5 = dummyres;
+                    case 1:
+                        outptr6 = dummyres;
+                    case 0:
+                        outptr7 = dummyres;
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+            }
+
+            /* For ragged X, manually copy over the valid results. */
+            if((i + 23) >= xmax)
+            {
+                for(int xi = 0; xi < 24; xi++)
+                {
+                    if((i + xi) < xmax)
+                    {
+                        *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+                        outptr0++;
+                        *outptr1 = (alpha * inptr[xi + 24]) + (*outptr1 * beta);
+                        outptr1++;
+                        *outptr2 = (alpha * inptr[xi + 48]) + (*outptr2 * beta);
+                        outptr2++;
+                        *outptr3 = (alpha * inptr[xi + 72]) + (*outptr3 * beta);
+                        outptr3++;
+                        *outptr4 = (alpha * inptr[xi + 96]) + (*outptr4 * beta);
+                        outptr4++;
+                        *outptr5 = (alpha * inptr[xi + 120]) + (*outptr5 * beta);
+                        outptr5++;
+                        *outptr6 = (alpha * inptr[xi + 144]) + (*outptr6 * beta);
+                        outptr6++;
+                        *outptr7 = (alpha * inptr[xi + 168]) + (*outptr7 * beta);
+                        outptr7++;
+                    }
+                }
+                inptr += 192;
+            }
+            else
+            {
+                /* Optimized routine to merge an entire block */
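+                /* All arithmetic stays in fp16 (8 halves per vector).  The ".arch"
+                 * directive below lets the assembler accept the Armv8.2 FP16 vector
+                 * instructions even though the file itself is only guarded on
+                 * scalar FP16 support. */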
+                __asm __volatile(
+                    ".arch    armv8.2-a+fp16\n"
+                    // Rows 0-1
+                    "LDP    q16, q17, [%[outptr0]]\n"
+                    "FMUL    v16.8h, v16.8h, %[vb].8h\n"
+                    "LDR    q18, [%[outptr0], #32]\n"
+                    "FMUL    v17.8h, v17.8h, %[vb].8h\n"
+                    "LDP    q19, q20, [%[outptr1]]\n"
+                    "FMUL    v18.8h, v18.8h, %[vb].8h\n" ASM_PREFETCH("[%[inptr], #768]")
+                    "LDR    q21, [%[outptr1], #32]\n"
+                    "FMUL    v19.8h, v19.8h, %[vb].8h\n"
+                    "LDP    q0,  q1,  [%[inptr]]\n"
+                    "FMUL    v20.8h, v20.8h, %[vb].8h\n"
+                    "LDP    q2,  q3,  [%[inptr], #32]\n"
+                    "FMUL    v21.8h, v21.8h, %[vb].8h\n"
+                    "LDP    q4,  q5,  [%[inptr], #64]\n"
+                    "FMLA    v16.8h, v0.8h, %[va].8h\n" ASM_PREFETCH("[%[inptr], #832]")
+                    "FMLA    v17.8h, v1.8h, %[va].8h\n"
+                    "STP    q16, q17, [%[outptr0]], #32\n"
+                    "FMLA    v18.8h, v2.8h, %[va].8h\n"
+                    "STR    q18, [%[outptr0]], #16\n"
+                    "FMLA    v19.8h, v3.8h, %[va].8h\n" ASM_PREFETCH("[%[inptr], #896]")
+                    "FMLA    v20.8h, v4.8h, %[va].8h\n"
+                    "STP    q19, q20, [%[outptr1]], #32\n"
+                    "FMLA    v21.8h, v5.8h, %[va].8h\n"
+                    "STR    q21, [%[outptr1]], #16\n" ASM_PREFETCH("[%[inptr], #960]")
+
+                    // Rows 2-3
+                    "LDP    q16, q17, [%[outptr2]]\n"
+                    "FMUL    v16.8h, v16.8h, %[vb].8h\n"
+                    "LDR    q18, [%[outptr2], #32]\n"
+                    "FMUL    v17.8h, v17.8h, %[vb].8h\n"
+                    "LDP    q19, q20, [%[outptr3]]\n"
+                    "FMUL    v18.8h, v18.8h, %[vb].8h\n" ASM_PREFETCH("[%[inptr], #1024]")
+                    "LDR    q21, [%[outptr3], #32]\n"
+                    "FMUL    v19.8h, v19.8h, %[vb].8h\n"
+                    "LDP    q0,  q1,  [%[inptr], #96]\n"
+                    "FMUL    v20.8h, v20.8h, %[vb].8h\n"
+                    "LDP    q2,  q3,  [%[inptr], #128]\n"
+                    "FMUL    v21.8h, v21.8h, %[vb].8h\n"
+                    "LDP    q4,  q5,  [%[inptr], #160]\n"
+                    "FMLA    v16.8h, v0.8h, %[va].8h\n" ASM_PREFETCH("[%[inptr], #1088]")
+                    "FMLA    v17.8h, v1.8h, %[va].8h\n"
+                    "STP    q16, q17, [%[outptr2]], #32\n"
+                    "FMLA    v18.8h, v2.8h, %[va].8h\n"
+                    "STR    q18, [%[outptr2]], #16\n"
+                    "FMLA    v19.8h, v3.8h, %[va].8h\n" ASM_PREFETCH("[%[outptr0], #80]")
+                    "FMLA    v20.8h, v4.8h, %[va].8h\n"
+                    "STP    q19, q20, [%[outptr3]], #32\n"
+                    "FMLA    v21.8h, v5.8h, %[va].8h\n"
+                    "STR    q21, [%[outptr3]], #16\n" ASM_PREFETCH("[%[outptr1], #80]")
+
+                    // Rows 4-5
+                    "LDP    q16, q17, [%[outptr4]]\n"
+                    "FMUL    v16.8h, v16.8h, %[vb].8h\n"
+                    "LDR    q18, [%[outptr4], #32]\n"
+                    "FMUL    v17.8h, v17.8h, %[vb].8h\n"
+                    "LDP    q19, q20, [%[outptr5]]\n"
+                    "FMUL    v18.8h, v18.8h, %[vb].8h\n" ASM_PREFETCH("[%[outptr2], #80]")
+                    "LDR    q21, [%[outptr5], #32]\n"
+                    "FMUL    v19.8h, v19.8h, %[vb].8h\n"
+                    "LDP    q0,  q1,  [%[inptr], #192]\n"
+                    "FMUL    v20.8h, v20.8h, %[vb].8h\n"
+                    "LDP    q2,  q3,  [%[inptr], #224]\n"
+                    "FMUL    v21.8h, v21.8h, %[vb].8h\n"
+                    "LDP    q4,  q5,  [%[inptr], #256]\n"
+                    "FMLA    v16.8h, v0.8h, %[va].8h\n" ASM_PREFETCH("[%[outptr3], #80]")
+                    "FMLA    v17.8h, v1.8h, %[va].8h\n"
+                    "STP    q16, q17, [%[outptr4]], #32\n"
+                    "FMLA    v18.8h, v2.8h, %[va].8h\n"
+                    "STR    q18, [%[outptr4]], #16\n"
+                    "FMLA    v19.8h, v3.8h, %[va].8h\n" ASM_PREFETCH("[%[outptr4], #80]")
+                    "FMLA    v20.8h, v4.8h, %[va].8h\n"
+                    "STP    q19, q20, [%[outptr5]], #32\n"
+                    "FMLA    v21.8h, v5.8h, %[va].8h\n"
+                    "STR    q21, [%[outptr5]], #16\n"
+
+                    // Rows 6-7
+                    "LDP    q16, q17, [%[outptr6]]\n"
+                    "FMUL    v16.8h, v16.8h, %[vb].8h\n"
+                    "LDR    q18, [%[outptr6], #32]\n"
+                    "FMUL    v17.8h, v17.8h, %[vb].8h\n"
+                    "LDP    q19, q20, [%[outptr7]]\n" ASM_PREFETCH("[%[outptr5], #80]")
+                    "FMUL    v18.8h, v18.8h, %[vb].8h\n"
+                    "LDR    q21, [%[outptr7], #32]\n"
+                    "FMUL    v19.8h, v19.8h, %[vb].8h\n"
+                    "LDP    q0,  q1,  [%[inptr], #288]\n"
+                    "FMUL    v20.8h, v20.8h, %[vb].8h\n"
+                    "LDP    q2,  q3,  [%[inptr], #320]\n"
+                    "FMUL    v21.8h, v21.8h, %[vb].8h\n"
+                    "LDP    q4,  q5,  [%[inptr], #352]\n"
+                    "FMLA    v16.8h, v0.8h, %[va].8h\n" ASM_PREFETCH("[%[outptr6], #128]")
+                    "FMLA    v17.8h, v1.8h, %[va].8h\n"
+                    "STP    q16, q17, [%[outptr6]], #32\n"
+                    "FMLA    v18.8h, v2.8h, %[va].8h\n"
+                    "STR    q18, [%[outptr6]], #16\n"
+                    "FMLA    v19.8h, v3.8h, %[va].8h\n" ASM_PREFETCH("[%[outptr7], #128]")
+                    "FMLA    v20.8h, v4.8h, %[va].8h\n"
+                    "STP    q19, q20, [%[outptr7]], #32\n"
+                    "FMLA    v21.8h, v5.8h, %[va].8h\n"
+                    "STR    q21, [%[outptr7]], #16\n"
+                    "ADD    %[inptr], %[inptr], #384\n"
+                    : [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3),
+                    [outptr4] "+r"(outptr4), [outptr5] "+r"(outptr5), [outptr6] "+r"(outptr6), [outptr7] "+r"(outptr7),
+                    [inptr] "+r"(inptr)
+                    : [va] "w"(va), [vb] "w"(vb)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21");
+            }
+        }
+    }
+}
+
+#endif // __aarch64__ && __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_int32_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_int32_12x8.hpp
new file mode 100644
index 0000000..dc247aa
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_int32_12x8.hpp
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+template <>
+inline void MergeResults<12, 8>(int32_t *out, const int32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const int32_t alpha, const int32_t beta)
+{
+    const int32_t *inptr = in;
+    prefetch_6x(inptr);
+    prefetch_6x(inptr + 96);
+
+    int32x4_t alpha_value = vdupq_n_s32(alpha);
+    int32x4_t beta_value  = vdupq_n_s32(beta);
+
+    for(int y = y0; y < ymax; y += 8)
+    {
+        int32_t *outptr0 = out + (y * ldout) + x0;
+        int32_t *outptr1 = outptr0 + ldout;
+        int32_t *outptr2 = outptr1 + ldout;
+        int32_t *outptr3 = outptr2 + ldout;
+        int32_t *outptr4 = outptr3 + ldout;
+        int32_t *outptr5 = outptr4 + ldout;
+        int32_t *outptr6 = outptr5 + ldout;
+        int32_t *outptr7 = outptr6 + ldout;
+
+        prefetch_2x(outptr0);
+        prefetch_2x(outptr1);
+        prefetch_2x(outptr2);
+        prefetch_2x(outptr3);
+        prefetch_2x(outptr4);
+        prefetch_2x(outptr5);
+        prefetch_2x(outptr6);
+        prefetch_2x(outptr7);
+
+        for(int i = x0; i < xmax; i += 12)
+        {
+            int32_t dummyres[12];
+
+            /* Make sure we throw away results if Y isn't a multiple of 8.
+             * We do this by pointing the result pointer at a dummy buffer
+             * we later discard.  */
+            if((y + 7) >= ymax)
+            {
+                switch((y + 7) - ymax)
+                {
+                    case 6:
+                        outptr1 = dummyres;
+                    case 5:
+                        outptr2 = dummyres;
+                    case 4:
+                        outptr3 = dummyres;
+                    case 3:
+                        outptr4 = dummyres;
+                    case 2:
+                        outptr5 = dummyres;
+                    case 1:
+                        outptr6 = dummyres;
+                    case 0:
+                        outptr7 = dummyres;
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+            }
+
+            /* For ragged X, manually copy over the valid results. */
+            if((i + 11) >= xmax)
+            {
+                for(int xi = 0; xi < 12; xi++)
+                {
+                    if((i + xi) < xmax)
+                    {
+                        *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+                        outptr0++;
+                        *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+                        outptr1++;
+                        *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+                        outptr2++;
+                        *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+                        outptr3++;
+                        *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
+                        outptr4++;
+                        *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
+                        outptr5++;
+                        *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta);
+                        outptr6++;
+                        *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta);
+                        outptr7++;
+                    }
+                }
+                inptr += 96;
+            }
+            else
+            {
+                /* Optimized routine to merge an entire block */
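+                /* The rows are software-pipelined: v0-v2 and v3-v5 alternate as the
+                 * output accumulators while v6-v8 carry the incoming results, so the
+                 * loads for one row overlap the stores of the previous one. */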
+                __asm __volatile(
+                    // Row 0
+                    ASM_PREFETCH("[%x[outptr1], #192]")
+                    "ldr q3, [%x[outptr0]]\n"
+                    "ldr q4, [%x[outptr0], #0x10]\n"
+                    "ldr q5, [%x[outptr0], #0x20]\n"
+                    "mul v3.4s, v3.4s, %[alpha_value].4s\n"
+                    "ldr q6, [%x[inptr]]\n"
+                    "mul v4.4s, v4.4s, %[alpha_value].4s\n"
+                    "ldr q7, [%x[inptr], #0x10]\n"
+                    "mul v5.4s, v5.4s, %[alpha_value].4s\n"
+                    "ldr q8, [%x[inptr], #0x20]\n"
+                    "mla v3.4s, v6.4s, %[beta_value].4s\n"
+                    "ldr q0, [%x[outptr1]]\n"
+                    "mla v4.4s, v7.4s, %[beta_value].4s\n"
+                    "ldr q1, [%x[outptr1], #0x10]\n"
+                    "mla v5.4s, v8.4s, %[beta_value].4s\n"
+                    "ldr q2, [%x[outptr1], #0x20]\n"
+
+                    // Row 1
+                    ASM_PREFETCH("[%x[outptr2], #192]")
+                    "mul v0.4s, v0.4s, %[alpha_value].4s\n"
+                    "ldr q6, [%x[inptr], #0x30]\n"
+                    "str q3, [%x[outptr0]], #0x10\n"
+                    "mul v1.4s, v1.4s, %[alpha_value].4s\n"
+                    "ldr q7, [%x[inptr], #0x40]\n"
+                    "str q4, [%x[outptr0]], #0x10\n"
+                    "mul v2.4s, v2.4s, %[alpha_value].4s\n"
+                    "ldr q8, [%x[inptr], #0x50]\n"
+                    "str q5, [%x[outptr0]], #0x10\n"
+                    "mla v0.4s, v6.4s, %[beta_value].4s\n"
+                    "ldr q3, [%x[outptr2]]\n"
+                    "mla v1.4s, v7.4s, %[beta_value].4s\n"
+                    "ldr q4, [%x[outptr2], #0x10]\n"
+                    "mla v2.4s, v8.4s, %[beta_value].4s\n"
+                    "ldr q5, [%x[outptr2], #0x20]\n"
+
+                    // Row 2
+                    ASM_PREFETCH("[%x[outptr3], #192]")
+                    "mul v3.4s, v3.4s, %[alpha_value].4s\n"
+                    "ldr q6, [%x[inptr], #0x60]\n"
+                    "str q0, [%x[outptr1]], #0x10\n"
+                    "mul v4.4s, v4.4s, %[alpha_value].4s\n"
+                    "ldr q7, [%x[inptr], #0x70]\n"
+                    "str q1, [%x[outptr1]], #0x10\n"
+                    "mul v5.4s, v5.4s, %[alpha_value].4s\n"
+                    "ldr q8, [%x[inptr], #0x80]\n"
+                    "str q2, [%x[outptr1]], #0x10\n"
+                    "mla v3.4s, v6.4s, %[beta_value].4s\n"
+                    "ldr q0, [%x[outptr3]]\n"
+                    "mla v4.4s, v7.4s, %[beta_value].4s\n"
+                    "ldr q1, [%x[outptr3], #0x10]\n"
+                    "mla v5.4s, v8.4s, %[beta_value].4s\n"
+                    "ldr q2, [%x[outptr3], #0x20]\n"
+
+                    // Row 3
+                    ASM_PREFETCH("[%x[outptr4], #192]")
+                    "mul v0.4s, v0.4s, %[alpha_value].4s\n"
+                    "ldr q6, [%x[inptr], #0x90]\n"
+                    "str q3, [%x[outptr2]], #0x10\n"
+                    "mul v1.4s, v1.4s, %[alpha_value].4s\n"
+                    "ldr q7, [%x[inptr], #0xa0]\n"
+                    "str q4, [%x[outptr2]], #0x10\n"
+                    "mul v2.4s, v2.4s, %[alpha_value].4s\n"
+                    "ldr q8, [%x[inptr], #0xb0]\n"
+                    "str q5, [%x[outptr2]], #0x10\n"
+                    "mla v0.4s, v6.4s, %[beta_value].4s\n"
+                    "ldr q3, [%x[outptr4]]\n"
+                    "mla v1.4s, v7.4s, %[beta_value].4s\n"
+                    "ldr q4, [%x[outptr4], #0x10]\n"
+                    "mla v2.4s, v8.4s, %[beta_value].4s\n"
+                    "ldr q5, [%x[outptr4], #0x20]\n"
+
+                    // Row 4
+                    ASM_PREFETCH("[%x[outptr5], #192]")
+                    "mul v3.4s, v3.4s, %[alpha_value].4s\n"
+                    "ldr q6, [%x[inptr], #0xc0]\n"
+                    "str q0, [%x[outptr3]], #0x10\n"
+                    "mul v4.4s, v4.4s, %[alpha_value].4s\n"
+                    "ldr q7, [%x[inptr], #0xd0]\n"
+                    "str q1, [%x[outptr3]], #0x10\n"
+                    "mul v5.4s, v5.4s, %[alpha_value].4s\n"
+                    "ldr q8, [%x[inptr], #0xe0]\n"
+                    "str q2, [%x[outptr3]], #0x10\n"
+                    "mla v3.4s, v6.4s, %[beta_value].4s\n"
+                    "ldr q0, [%x[outptr5]]\n"
+                    "mla v4.4s, v7.4s, %[beta_value].4s\n"
+                    "ldr q1, [%x[outptr5], #0x10]\n"
+                    "mla v5.4s, v8.4s, %[beta_value].4s\n"
+                    "ldr q2, [%x[outptr5], #0x20]\n"
+
+                    // Row 5
+                    ASM_PREFETCH("[%x[outptr6], #192]")
+                    "mul v0.4s, v0.4s, %[alpha_value].4s\n"
+                    "ldr q6, [%x[inptr], #0xf0]\n"
+                    "str q3, [%x[outptr4]], #0x10\n"
+                    "mul v1.4s, v1.4s, %[alpha_value].4s\n"
+                    "ldr q7, [%x[inptr], #0x100]\n"
+                    "str q4, [%x[outptr4]], #0x10\n"
+                    "mul v2.4s, v2.4s, %[alpha_value].4s\n"
+                    "ldr q8, [%x[inptr], #0x110]\n"
+                    "str q5, [%x[outptr4]], #0x10\n"
+                    "mla v0.4s, v6.4s, %[beta_value].4s\n"
+                    "ldr q3, [%x[outptr6]]\n"
+                    "mla v1.4s, v7.4s, %[beta_value].4s\n"
+                    "ldr q4, [%x[outptr6], #0x10]\n"
+                    "mla v2.4s, v8.4s, %[beta_value].4s\n"
+                    "ldr q5, [%x[outptr6], #0x20]\n"
+
+                    // Row 6
+                    ASM_PREFETCH("[%x[outptr7], #192]")
+                    "mul v3.4s, v3.4s, %[alpha_value].4s\n"
+                    "ldr q6, [%x[inptr], #0x120]\n"
+                    "str q0, [%x[outptr5]], #0x10\n"
+                    "mul v4.4s, v4.4s, %[alpha_value].4s\n"
+                    "ldr q7, [%x[inptr], #0x130]\n"
+                    "str q1, [%x[outptr5]], #0x10\n"
+                    "mul v5.4s, v5.4s, %[alpha_value].4s\n"
+                    "ldr q8, [%x[inptr], #0x140]\n"
+                    "str q2, [%x[outptr5]], #0x10\n"
+                    "mla v3.4s, v6.4s, %[beta_value].4s\n"
+                    "ldr q0, [%x[outptr7]]\n"
+                    "mla v4.4s, v7.4s, %[beta_value].4s\n"
+                    "ldr q1, [%x[outptr7], #0x10]\n"
+                    "mla v5.4s, v8.4s, %[beta_value].4s\n"
+                    "ldr q2, [%x[outptr7], #0x20]\n"
+
+                    // Row 7
+                    "mul v0.4s, v0.4s, %[alpha_value].4s\n"
+                    "ldr q6, [%x[inptr], #0x150]\n"
+                    "str q3, [%x[outptr6]], #0x10\n"
+                    "mul v1.4s, v1.4s, %[alpha_value].4s\n"
+                    "ldr q7, [%x[inptr], #0x160]\n"
+                    "str q4, [%x[outptr6]], #0x10\n"
+                    "mul v2.4s, v2.4s, %[alpha_value].4s\n"
+                    "ldr q8, [%x[inptr], #0x170]\n"
+                    "str q5, [%x[outptr6]], #0x10\n"
+                    "mla v0.4s, v6.4s, %[beta_value].4s\n"
+                    "mla v1.4s, v7.4s, %[beta_value].4s\n"
+                    "mla v2.4s, v8.4s, %[beta_value].4s\n"
+                    "str q0, [%x[outptr7]], #0x10\n"
+                    "str q1, [%x[outptr7]], #0x10\n"
+                    "str q2, [%x[outptr7]], #0x10\n"
+
+                    "add %x[inptr], %x[inptr], #0x180\n"
+                    : [outptr0] "+r"(outptr0),
+                    [outptr1] "+r"(outptr1),
+                    [outptr2] "+r"(outptr2),
+                    [outptr3] "+r"(outptr3),
+                    [outptr4] "+r"(outptr4),
+                    [outptr5] "+r"(outptr5),
+                    [outptr6] "+r"(outptr6),
+                    [outptr7] "+r"(outptr7),
+                    [inptr] "+r"(inptr)
+                    : [alpha_value] "w"(alpha_value),
+                    [beta_value] "w"(beta_value)
+                    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8");
+            }
+        }
+    }
+}
+
+template <>
+inline void MergeResults<12, 8>(uint32_t *out, const uint32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const uint32_t alpha, const uint32_t beta)
+{
+    // Since the above code uses only MUL and MLA instructions, the "unsignedness" can be discarded and the signed version reused safely.
+    MergeResults<12, 8>(reinterpret_cast<int32_t *>(out), reinterpret_cast<const int32_t *>(in), ldout, y0, ymax, x0, xmax, static_cast<int32_t>(alpha), static_cast<int32_t>(beta));
+}
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/merges/list.hpp b/src/core/NEON/kernels/arm_gemm/merges/list.hpp
new file mode 100644
index 0000000..7d56e58
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/list.hpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "a32_merge_float_8x6.hpp"
+#include "a64_merge_float_12x8.hpp"
+#include "a64_merge_float_to_half_12x8.hpp"
+#include "a64_merge_half_24x8.hpp"
+#include "a64_merge_int32_12x8.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/misc.cpp b/src/core/NEON/kernels/arm_gemm/misc.cpp
new file mode 100644
index 0000000..b29cc58
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/misc.cpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <newgemm_lib.hpp>
+
+unsigned int get_cpu_impl()
+{
+#ifndef BARE_METAL
+    int   fd = open("/proc/cpuinfo", 0);
+    char  buff[3000];
+    char *pos;
+    char *end;
+    int   foundid = 0;
+    int   variant = 0;
+
+    int cpu = sched_getcpu();
+
+    if(fd < 0)
+    {
+        return 0;
+    }
+
+    int charsread = read(fd, buff, 3000);
+    pos           = buff;
+    end           = buff + charsread;
+
+    close(fd);
+
+    /* So, to date I've encountered two formats for /proc/cpuinfo.
+     *
+     * One of them just lists processor : n  for each processor (with no
+     * other info), then at the end lists part information for the current
+     * CPU.
+     *
+     * The other has an entire clause (including part number info) for each
+     * CPU in the system, with "processor : n" headers.
+     *
+     * We can cope with either of these formats by waiting to see
+     * "processor: n" (where n = our CPU ID), and then looking for the next
+     * "CPU part" field.
+     */
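+    /* Illustrative layout of the second format (values are examples only):
+     *
+     *   processor   : 4
+     *   CPU variant : 0x0
+     *   CPU part    : 0xd08
+     */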
+    while(pos < end)
+    {
+        if(foundid && !strncmp(pos, "CPU variant", 11))
+        {
+            pos += 13;
+            char *resume = end; // Need to continue scanning after this
+
+            for(char *ch = pos; ch < end; ch++)
+            {
+                if(*ch == '\n')
+                {
+                    *ch    = '\0';
+                    resume = ch + 1;
+                    break;
+                }
+            }
+
+            variant = strtoul(pos, NULL, 0);
+
+            pos = resume;
+        }
+
+        if(foundid && !strncmp(pos, "CPU part", 8))
+        {
+            /* Found part number */
+            pos += 11;
+            unsigned int num;
+
+            for(char *ch = pos; ch < end; ch++)
+            {
+                if(*ch == '\n')
+                {
+                    *ch = '\0';
+                    break;
+                }
+            }
+
+            num = strtoul(pos, NULL, 0);
+
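+            /* Pack the fields as they are laid out in MIDR_EL1:
+             * part number in bits [15:4], variant in bits [23:20]. */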
+            return (num << 4) | (variant << 20);
+        }
+
+        if(!strncmp(pos, "processor", 9))
+        {
+            /* Found processor ID, see if it's ours. */
+            pos += 11;
+            int num;
+
+            for(char *ch = pos; ch < end; ch++)
+            {
+                if(*ch == '\n')
+                {
+                    *ch = '\0';
+                    break;
+                }
+            }
+
+            num = strtol(pos, NULL, 0);
+
+            if(num == cpu)
+            {
+                foundid = 1;
+            }
+        }
+
+        while(pos < end)
+        {
+            char ch = *pos++;
+            if(ch == '\n' || ch == '\0')
+            {
+                break;
+            }
+        }
+    }
+#endif
+
+    return 0;
+}
+
+CPUInfo *get_CPUInfo()
+{
+    static CPUInfo ci;
+
+    return &ci;
+}
diff --git a/src/core/NEON/kernels/arm_gemm/profiler.hpp b/src/core/NEON/kernels/arm_gemm/profiler.hpp
new file mode 100644
index 0000000..c38b0a4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/profiler.hpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef CYCLE_PROFILING
+
+#include "../perf.h"
+
+#ifndef NO_MULTI_THREADING
+#include <mutex>
+#endif
+
+namespace arm_gemm
+{
+#ifndef NO_MULTI_THREADING
+extern std::mutex report_mutex;
+#endif
+
+class profiler
+{
+private:
+    static const int maxevents         = 100000;
+    unsigned long    times[maxevents]  = {};
+    unsigned long    units[maxevents]  = {};
+    int              events[maxevents] = {};
+    int              currentevent      = 0;
+    int              countfd           = 0;
+
+public:
+    profiler()
+    {
+        countfd = open_cycle_counter();
+    }
+
+    ~profiler()
+    {
+        close(countfd);
+        int           tots[5];
+        unsigned long counts[5];
+        unsigned long tunits[5];
+        const char   *descs[] = { "Prepare A", "Prepare B", "Kernel", "Merge" };
+
+        for(int i = 1; i < 5; i++)
+        {
+            tots[i]   = 0;
+            counts[i] = 0;
+            tunits[i] = 0;
+        }
+
+        for(int i = 0; i < currentevent; i++)
+        {
+            //            printf("%10s: %ld\n", descs[events[i]-1], times[i]);
+            tots[events[i]]++;
+            counts[events[i]] += times[i];
+            tunits[events[i]] += units[i];
+        }
+
+#ifdef NO_MULTI_THREADING
+        printf("Profiled events:\n");
+#else
+        std::lock_guard<std::mutex> lock(report_mutex);
+        printf("Profiled events (cpu %d):\n", sched_getcpu());
+#endif
+
+        printf("%20s  %9s %9s %9s %12s %9s\n", "", "Events", "Total", "Average", "Bytes/MACs", "Per cycle");
+        for(int i = 1; i < 5; i++)
+        {
+            printf("%20s: %9d %9ld %9ld %12lu %9.2f\n", descs[i - 1], tots[i], counts[i], counts[i] / tots[i], tunits[i], (float)tunits[i] / counts[i]);
+        }
+    }
+
+    template <typename T>
+    void operator()(int i, unsigned long u, T func)
+    {
+        if(currentevent == maxevents)
+        {
+            func();
+        }
+        else
+        {
+            events[currentevent] = i;
+            units[currentevent]  = u;
+            start_counter(countfd);
+            func();
+            long long cycs        = stop_counter(countfd);
+            times[currentevent++] = cycs;
+        }
+    }
+};
+
+#else
+
+namespace arm_gemm
+{
+class profiler
+{
+public:
+    template <typename T>
+    void operator()(int i, unsigned long u, T func)
+    {
+        func();
+    }
+};
+
+#endif // CYCLE_PROFILING
+
+} // namespace arm_gemm
+
+#define PROFILE_PREPA 1
+#define PROFILE_PREPB 2
+#define PROFILE_KERNEL 3
+#define PROFILE_MERGE 4
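+
+// Illustrative use (a minimal sketch; 'run_kernel', M, N and K are assumed
+// names, not part of this file):
+//
+//   arm_gemm::profiler prof;
+//   prof(PROFILE_KERNEL, (unsigned long)(M * N * K),
+//        [&]() { run_kernel(); });
+//
+// With CYCLE_PROFILING defined, the functor is timed with the cycle counter
+// and per-event totals are printed when 'prof' is destroyed; otherwise the
+// functor is simply invoked.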
diff --git a/src/core/NEON/kernels/arm_gemm/transform.hpp b/src/core/NEON/kernels/arm_gemm/transform.hpp
new file mode 100644
index 0000000..c80bb59
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transform.hpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+/*
+ * Generic transform.
+ *
+ * Assuming the untransposed case, this works by first reading <BlockBy>
+ * consecutive values from the first input row.  The same number of values
+ * is then read from each of the next <IntBy-1> rows.  The process then
+ * returns to the first input row and repeats with the next <BlockBy> columns.
+ *
+ * Need to cope with the work requested in either dimension not actually
+ * being a multiple of the block sizes.
+ */
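+/*
+ * For example (illustrative only): with IntBy=4, BlockBy=1, Transposed=false,
+ * y0=0, ymax=4, x0=0 and xmax=3, the output is written in the order
+ *
+ *   in[0][0], in[1][0], in[2][0], in[3][0],
+ *   in[0][1], in[1][1], in[2][1], in[3][1],
+ *   in[0][2], in[1][2], in[2][2], in[3][2]
+ *
+ * where in[r][c] denotes in[r * stride + c].  In other words, one
+ * BlockBy-wide slice is taken from each of the IntBy rows in turn, and any
+ * out-of-range rows or columns are filled with zeroes.
+ */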
+template <unsigned int IntBy, unsigned int BlockBy, bool Transposed, size_t TOutSize, size_t TInSize>
+struct TransformImpl
+{
+    template <typename TOut, typename TIn>
+    static void Transform(TOut *out, const TIn *const in, const int stride,
+                          const int y0, const int ymax, const int x0, const int xmax)
+    {
+        const int n_whole_y_blocks = (ymax - y0) / IntBy;
+        const int y_remainders     = (ymax - y0) % IntBy;
+        const int n_y_blocks       = n_whole_y_blocks + (y_remainders ? 1 : 0);
+
+        const int n_whole_x_blocks = (xmax - x0) / BlockBy;
+        const int x_remainders     = (xmax - x0) % BlockBy;
+        const int n_x_blocks       = n_whole_x_blocks + (x_remainders ? 1 : 0);
+
+        // "Y" loop: advance down the rows of the source IntBy rows at a time.
+        // Set up fill_rows to show the number rows to copy from, and blank_rows
+        // for the number of blank rows to add.
+        for(int y_block = 0; y_block < n_y_blocks; y_block++)
+        {
+            int fill_rows  = (y_block < n_whole_y_blocks) ? IntBy : y_remainders;
+            int blank_rows = IntBy - fill_rows;
+
+            int y_base = y0 + (y_block * IntBy);
+
+            // So now advance along this block of rows, BlockBy columns at a time.
+            for(int x_block = 0; x_block < n_x_blocks; x_block++)
+            {
+                int fill_cols  = (x_block < n_whole_x_blocks) ? BlockBy : x_remainders;
+                int blank_cols = BlockBy - fill_cols;
+
+                int x_base = x0 + (x_block * BlockBy);
+
+                for(int row = 0; row < fill_rows; row++)
+                {
+                    for(int col = 0; col < fill_cols; col++)
+                    {
+                        // In-range copy.  If it's transposed, we reverse the sense of rows and columns here.
+                        if(Transposed)
+                        {
+                            *out++ = static_cast<TOut>(in[(x_base + col) * stride + y_base + row]);
+                        }
+                        else
+                        {
+                            *out++ = static_cast<TOut>(in[(y_base + row) * stride + x_base + col]);
+                        }
+                    }
+                    // "col" tail - row is in range but column is out of range.
+                    for(int col = 0; col < blank_cols; col++)
+                    {
+                        *out++ = static_cast<TOut>(0);
+                    }
+                }
+                // "row" tail - row is out of range so fill with zeros always.
+                for(int row = 0; row < blank_rows; row++)
+                {
+                    for(int col = 0; col < (fill_cols + blank_cols); col++)
+                    {
+                        *out++ = static_cast<TOut>(0);
+                    }
+                }
+            }
+        }
+    }
+
+    template <typename T>
+    static inline void Transform(T *out, const T *const in, const int stride,
+                                 const int k0, const int kmax, const int x0, const int xmax)
+    {
+        Transform<T, T>(out, in, stride, k0, kmax, x0, xmax);
+    }
+};
+
+/*****************************************************************************/
+template <unsigned int IntBy, unsigned int BlockBy, bool Transposed, typename TOut, typename TIn>
+void Transform(
+    TOut *out, const TIn *const in, const int stride,
+    const int k0, const int kmax, const int x0, const int xmax)
+{
+    // Redirect to a specialised implementation predicated on argument size.
+    TransformImpl<IntBy, BlockBy, Transposed, sizeof(TOut), sizeof(TIn)>::Transform(
+        out, in, stride, k0, kmax, x0, xmax);
+}
+/*****************************************************************************/
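+
+// Example call (a sketch; 'a_panel', 'a_ptr' and 'lda' are assumed names):
+// interleave rows [y0,ymax) and columns [k0,kmax) of a float matrix into an
+// 8-row interleaved panel:
+//
+//   Transform<8, 1, false>(a_panel, a_ptr, lda, y0, ymax, k0, kmax);
+//
+// The element sizes deduced from the pointer types select the matching
+// TransformImpl specialisation from transforms/list.hpp.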
+
+#include "transforms/list.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp
new file mode 100644
index 0000000..f09e5a0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __arm__
+
+#include <arm_neon.h>
+
+#include "../asmlib.hpp"
+
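+// 6-way interleave for 32-bit data: for each column index k in [k0,kmax) the
+// output receives one value from each of rows y..y+5 in turn, i.e.
+// out[6*(k-k0) + r] = in[(y+r)*ldin + k] within the panel covering rows
+// [y,y+6).  In the main loop, rows beyond ymax are redirected to a buffer of
+// zeroes so a full six-row panel is still written.
+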
+template <>
+template <typename T>
+inline void TransformImpl<6, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
+{
+    uint32_t       *outptr = reinterpret_cast<uint32_t *>(out);
+    const uint32_t *inptr  = reinterpret_cast<const uint32_t *>(in);
+
+    uint32_t zerobuff[8] = {};
+
+    for(int y = y0; y < ymax; y += 6)
+    {
+        const uint32_t *inptr0 = inptr + y * ldin + k0;
+        const uint32_t *inptr1 = inptr0 + ldin;
+        const uint32_t *inptr2 = inptr1 + ldin;
+        const uint32_t *inptr3 = inptr2 + ldin;
+        const uint32_t *inptr4 = inptr3 + ldin;
+        const uint32_t *inptr5 = inptr4 + ldin;
+
+        //prefetch_2x(inptr0);
+        //prefetch_2x(inptr1);
+        //prefetch_2x(inptr2);
+        //prefetch_2x(inptr3);
+        //prefetch_2x(inptr4);
+        //prefetch_2x(inptr5);
+
+        int x = (kmax - k0);
+        for(; x > 7; x -= 8)
+        {
+            /* Cope with ragged cases by copying from a buffer of zeroes instead */
+            if((y + 5) >= ymax)
+            {
+                switch((y + 5) - ymax)
+                {
+                    /* Everything falls through in here */
+                    case 4:
+                        inptr1 = zerobuff;
+                    case 3:
+                        inptr2 = zerobuff;
+                    case 2:
+                        inptr3 = zerobuff;
+                    case 1:
+                        inptr4 = zerobuff;
+                    case 0:
+                        inptr5 = zerobuff;
+                    default:
+                        break;
+                }
+            }
+
+            __asm __volatile(
+                // Load up 8 elements (2 vectors) from each of 6 sources.
+                "VLD1.32        {d0-d3}, [%[inptr0]]!\n"   // q0=A0A1A2A3
+                "VLD1.32        {d4-d7}, [%[inptr1]]!\n"   // q2=B0B1B2B3
+                "VLD1.32        {d8-d11}, [%[inptr2]]!\n"  // q4=C0C1C2C3
+                "VZIP.32    q0, q4\n"                      // q0=A0C0A1C1, q4 = A2C2A3C3
+                "VLD1.32        {d12-d15}, [%[inptr3]]!\n" // q6=D0D1D2D3
+                "VZIP.32    q2, q6\n"                      // q2=B0D0B1D1, q6 = B2D2B3D3
+                "VLD1.32        {d16-d19}, [%[inptr4]]!\n"
+                "VLD1.32        {d20-d23}, [%[inptr5]]!\n"
+                "VZIP.32    q8, q10\n" // q8=E0F0E1F1, q10 = E2F2E3F3
+                ASM_PREFETCH("[%[inptr0], #128]")
+                "VZIP.32    q0, q2\n" // q0 = A0B0C0D0, q2 = A1B1C1D1
+
+                // Store first elements
+                "VST1.32        {d0-d1}, [%[outptr]]!\n"
+                "VST1.32        {d16}, [%[outptr]]!\n"
+
+                "VZIP.32    q4, q6\n" // q4 = A2B2C2D2, q6 = A3B3C3D3
+
+                // Store second elements
+                "VST1.32        {d4-d5}, [%[outptr]]!\n"
+                "VZIP.32    q1, q5\n" ASM_PREFETCH("[%[inptr1], #128]")
+                "VST1.32        {d17}, [%[outptr]]!\n"
+                "VZIP.32    q3, q7\n"
+
+                // Store third elements
+                "VZIP.32    q9, q11\n"
+                "VST1.32        {d8-d9}, [%[outptr]]!\n"
+                "VZIP.32    q1, q3\n" ASM_PREFETCH("[%[inptr2], #128]")
+                "VST1.32        {d20}, [%[outptr]]!\n"
+
+                // Store fourth elements
+                "VZIP.32    q5, q7\n"
+                "VST1.32        {d12-d13}, [%[outptr]]!\n" ASM_PREFETCH("[%[inptr3], #128]")
+                "VST1.32        {d21}, [%[outptr]]!\n"
+
+                // Fifth
+                "VST1.32        {d2-d3}, [%[outptr]]!\n" ASM_PREFETCH("[%[inptr4], #128]")
+                "VST1.32        {d18}, [%[outptr]]!\n"
+
+                // Sixth
+                "VST1.32        {d6-d7}, [%[outptr]]!\n" ASM_PREFETCH("[%[inptr5], #128]")
+                "VST1.32        {d19}, [%[outptr]]!\n"
+
+                // Seventh
+                "VST1.32        {d10-d11}, [%[outptr]]!\n"
+                "VST1.32        {d22}, [%[outptr]]!\n"
+
+                // Eighth
+                "VST1.32        {d14-d15}, [%[outptr]]!\n"
+                "VST1.32        {d23}, [%[outptr]]!\n"
+
+                : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
+                [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [outptr] "+r"(outptr)
+                :
+                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12");
+        }
+
+        for(; x > 0; x--)
+        {
+            *outptr++ = *inptr0++;
+            *outptr++ = *inptr1++;
+            *outptr++ = *inptr2++;
+            *outptr++ = *inptr3++;
+            *outptr++ = *inptr4++;
+            *outptr++ = *inptr5++;
+        }
+    }
+}
+
+#endif // __arm__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
new file mode 100644
index 0000000..ea32c96
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __arm__
+
+#include "transpose_interleave_common.hpp"
+
+// Generic unblocked transposed 8x32-bit sized specialisation
+template <>
+template <typename T>
+inline void TransformImpl<8, 1, true, 4, 4>::Transform(
+    T *out, const T *const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax)
+{
+    // Redirect to a 16x uint16_t specialisation
+    TransformImpl<16, 1, true, 2, 2>::Transform(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t *const>(in),
+        stride * 2, x0 * 2, xmax * 2, k0, kmax);
+}
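+// (The transposed 8 x 32-bit and 16 x 16-bit layouts are byte-for-byte
+// identical, so halving the element size while doubling the stride and the
+// x range reuses the 16-bit kernel below with no extra work.)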
+
+// Generic 16x16-bit sized specialisation
+template <>
+template <typename T>
+inline void TransformImpl<16, 1, true, 2, 2>::Transform(
+    T *out, const T *const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax)
+{
+    // Redirect to a uint16_t specialisation
+    Transform(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t *const>(in),
+        stride, x0, xmax, k0, kmax);
+}
+
+// Specialised 16 x uint16_t version
+template <>
+inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out)
+{
+    __asm volatile(
+        "VLD1.32    {d0-d3}, [%[in0]]!\n"
+        "VST1.32    {d0-d3}, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
+        : [in0] "+r"(in0),
+        [out] "+r"(out)
+        :
+        : "q0", "q1", "memory");
+}
+
+template <>
+inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out)
+{
+    __asm volatile(
+        "VLD1.32    {d0-d3}, [%[in0]]!\n"
+        "VST1.32    {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in0], #192]")
+        "VLD1.32    {d0-d3}, [%[in1]]!\n"
+        "VST1.32    {d0-d3}, [%[out]]\n" ASM_PREFETCH("[%[in1], #192]") "SUB    %[out], %[out], #32\n"
+        : [in0] "+r"(in0),
+        [in1] "+r"(in1),
+        [out] "+r"(out)
+        :
+        : "q0", "q1", "memory");
+}
+
+template <>
+inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out)
+{
+    __asm __volatile(
+        "VLD1.32    {d0-d3}, [%[in0]]!\n"
+        "VST1.32    {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in0], #192]")
+        "VLD1.32    {d0-d3}, [%[in1]]!\n"
+        "VST1.32    {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in1], #192]")
+        "VLD1.32    {d0-d3}, [%[in2]]!\n"
+        "VST1.32    {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in2], #192]")
+        "VLD1.32    {d0-d3}, [%[in3]]!\n"
+        "VST1.32    {d0-d3}, [%[out]]\n" ASM_PREFETCH("[%[in3], #192]") "SUB    %[out], %[out], #96\n"
+        : [in0] "+r"(in0),
+        [in1] "+r"(in1),
+        [in2] "+r"(in2),
+        [in3] "+r"(in3),
+        [out] "+r"(out)
+        :
+        : "q0", "q1", "memory");
+}
+
+template <>
+template <>
+inline void TransformImpl<16, 1, true, 2, 2>::Transform(
+    uint16_t *out, const uint16_t *const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax)
+{
+    TransposeInterleaveCommon<16, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
+}
+
+#endif // __arm__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp
new file mode 100644
index 0000000..8d61f15
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../asmlib.hpp"
+#include "../utils.hpp"
+
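+// Block interleave for 8-bit data: rows are processed four at a time, and for
+// each 16-byte block of columns the output receives 16 bytes from row y, then
+// 16 from y+1, y+2 and y+3, before moving on to the next 16-byte block.
+// Ragged edges (rows past ymax, or a final block narrower than 16) are padded
+// with zeroes so every 4x16 tile in the output is complete.
+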
+template <>
+template <typename T>
+void TransformImpl<4, 16, false, 1, 1>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
+{
+    uint8_t       *outptr = (uint8_t *)out;
+    const uint8_t *inptr  = (uint8_t *)in;
+
+    uint8_t zerobuff[16] = {};
+
+    for(int y = y0; y < ymax; y += 4)
+    {
+        const uint8_t *inptr0 = inptr + y * ldin + k0;
+        const uint8_t *inptr1 = inptr0 + ldin;
+        const uint8_t *inptr2 = inptr1 + ldin;
+        const uint8_t *inptr3 = inptr2 + ldin;
+
+        prefetch_2x(inptr0);
+        prefetch_2x(inptr1);
+        prefetch_2x(inptr2);
+        prefetch_2x(inptr3);
+
+        int x = (kmax - k0);
+        for(; x > 15; x -= 16)
+        {
+            /* Cope with ragged cases by copying from a buffer of zeroes instead */
+            if((y + 3) >= ymax)
+            {
+                switch((y + 3) - ymax)
+                {
+                    /* Everything falls through in here */
+                    case 2:
+                        inptr1 = zerobuff;
+                    case 1:
+                        inptr2 = zerobuff;
+                    case 0:
+                        inptr3 = zerobuff;
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+            }
+
+            __asm __volatile(
+                "LDR    q0, [%[inptr0]], #16\n" ASM_PREFETCH("[%[inptr0], #176]") "LDR    q1, [%[inptr1]], #16\n" ASM_PREFETCH("[%[inptr1], #176]")
+                "STP    q0, q1, [%[outptr]], #32\n"
+                "LDR    q0, [%[inptr2]], #16\n" ASM_PREFETCH("[%[inptr2], #176]") "LDR    q1, [%[inptr3]], #16\n" ASM_PREFETCH("[%[inptr3], #176]") "STP    q0, q1, [%[outptr]], #32\n"
+                : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
+                [outptr] "+r"(outptr)
+                :
+                : "v0", "v1");
+        }
+
+        if(x > 0)
+        {
+            /* Need to duplicate this here, in case we didn't run the main loop. */
+            if((y + 3) >= ymax)
+            {
+                switch((y + 3) - ymax)
+                {
+                    /* Everything falls through in here */
+                    case 2:
+                        inptr1 = zerobuff;
+                    case 1:
+                        inptr2 = zerobuff;
+                    case 0:
+                        inptr3 = zerobuff;
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+            }
+
+            /* We have to write out 16 values, copy as many legal values as there are and pad with 0 */
+            auto f = [&outptr, x](const uint8_t *&p)
+            {
+                for(int i = 0; i < 16; i++)
+                {
+                    if(i < x)
+                    {
+                        *outptr++ = *p++;
+                    }
+                    else
+                    {
+                        *outptr++ = 0;
+                    }
+                }
+            };
+
+            f(inptr0);
+            f(inptr1);
+            f(inptr2);
+            f(inptr3);
+        }
+    }
+}
+
+#endif // __aarch64__
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp
new file mode 100644
index 0000000..3cbc881
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../asmlib.hpp"
+
+template <>
+template <typename T>
+void TransformImpl<8, 1, false, 2, 2>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
+{
+    uint16_t       *outptr = (uint16_t *)out;
+    const uint16_t *inptr  = (const uint16_t *)in;
+
+    uint16_t zerobuff[24] = {};
+
+    for(int y = y0; y < ymax; y += 8)
+    {
+        const uint16_t *inptr0 = inptr + y * ldin + k0;
+        const uint16_t *inptr1 = inptr0 + ldin;
+        const uint16_t *inptr2 = inptr1 + ldin;
+        const uint16_t *inptr3 = inptr2 + ldin;
+        const uint16_t *inptr4 = inptr3 + ldin;
+        const uint16_t *inptr5 = inptr4 + ldin;
+        const uint16_t *inptr6 = inptr5 + ldin;
+        const uint16_t *inptr7 = inptr6 + ldin;
+
+        prefetch_2x(inptr0);
+        prefetch_2x(inptr1);
+        prefetch_2x(inptr2);
+        prefetch_2x(inptr3);
+        prefetch_2x(inptr4);
+        prefetch_2x(inptr5);
+        prefetch_2x(inptr6);
+        prefetch_2x(inptr7);
+
+        int x = (kmax - k0);
+        for(; x > 7; x -= 8)
+        {
+            /* Cope with ragged cases by copying from a buffer of zeroes instead */
+            if((y + 7) >= ymax)
+            {
+                switch((y + 7) - ymax)
+                {
+                    /* Everything falls through in here */
+                    case 6:
+                        inptr1 = zerobuff;
+                    case 5:
+                        inptr2 = zerobuff;
+                    case 4:
+                        inptr3 = zerobuff;
+                    case 3:
+                        inptr4 = zerobuff;
+                    case 2:
+                        inptr5 = zerobuff;
+                    case 1:
+                        inptr6 = zerobuff;
+                    case 0:
+                        inptr7 = zerobuff;
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+            }
+
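+            /* Only issue the software prefetches when x is a multiple of 32
+             * (roughly once per 32 columns); otherwise skippf is non-zero and
+             * the CBNZ instructions branch over the PRFM blocks. */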
+            int skippf = (x & 31);
+            __asm __volatile(
+                // Load up 8 elements (1 vector) from each of 8 sources.
+                "CBNZ    %w[skippf], 1f\n" ASM_PREFETCH("[%[inptr0], #128]")
+                ASM_PREFETCH("[%[inptr1], #128]")
+                ASM_PREFETCH("[%[inptr2], #128]")
+                ASM_PREFETCH("[%[inptr3], #128]")
+                "1:\n"
+
+                "LDR    q0, [%[inptr0]], #16\n" // q0=A0A1A2A3A4A5A6A7
+                "LDR    q4, [%[inptr4]], #16\n" // q8=E0E1E2E3E4E5E6E7
+                "LDR    q2, [%[inptr2]], #16\n" // q4=C0C1C2C3...
+                "LDR    q6, [%[inptr6]], #16\n"
+                "ZIP1    v8.8h, v0.8h, v4.8h\n"  // q8=A0E0A1E1A2E2A3E3
+                "ZIP2    v16.8h, v0.8h, v4.8h\n" // q16=A4E4A5E5A6E6A7E7
+                "ZIP1    v9.8h, v2.8h, v6.8h\n"  // q9=C0G0C1G1C2G2C3G3
+                "ZIP2    v17.8h, v2.8h, v6.8h\n" // q17=C4G4C5G5C6G6C7G7
+                "LDR    q1, [%[inptr1]], #16\n"  // q1=B0B1B2B3B4B5B6B7
+                "LDR    q5, [%[inptr5]], #16\n"
+                "LDR    q3, [%[inptr3]], #16\n" // q3=D0D1D2D3....
+                "LDR    q7, [%[inptr7]], #16\n"
+                "ZIP1    v10.8h, v1.8h, v5.8h\n" // q18=B0F0B1F1B2F2B3F3
+                "ZIP2    v18.8h, v1.8h, v5.8h\n" // q18=B4F4B5F5B6F6B7F7
+                "ZIP1    v11.8h, v3.8h, v7.8h\n" // q19=D0H0D1H1D2H2D3H3
+                "ZIP2    v19.8h, v3.8h, v7.8h\n" // q19=D4H4D5H5D6H6D7H7
+
+                "ZIP1    v12.8h,  v8.8h,  v9.8h\n" // q20=A0C0E0G0A1C1E1G1
+                "ZIP2    v20.8h,  v8.8h,  v9.8h\n"
+                "ZIP1    v13.8h, v10.8h, v11.8h\n" // q21=B0D0F0H0B1I1F1H1
+                "ZIP2    v21.8h, v10.8h, v11.8h\n"
+
+                "CBNZ    %w[skippf], 2f\n" ASM_PREFETCH("[%[inptr4], #112]")
+                ASM_PREFETCH("[%[inptr5], #112]")
+                ASM_PREFETCH("[%[inptr6], #112]")
+                ASM_PREFETCH("[%[inptr7], #112]")
+                "2:\n"
+
+                "ZIP1    v22.8h, v16.8h, v17.8h\n"
+                "ZIP2    v30.8h, v16.8h, v17.8h\n"
+                "ZIP1    v23.8h, v18.8h, v19.8h\n"
+                "ZIP2    v31.8h, v18.8h, v19.8h\n"
+
+                "ZIP1    v14.8h, v12.8h, v13.8h\n"    // q22=A0B0C0D0E0F0G0H0
+                "ZIP2    v15.8h, v12.8h, v13.8h\n"    // q23=A1B1C1D1E1F1G1H1
+                "STP    q14, q15, [%[outptr]], #32\n" // Write back first two elements
+
+                "ZIP1    v0.8h, v20.8h, v21.8h\n"
+                "ZIP2    v1.8h, v20.8h, v21.8h\n"
+                "STP    q0, q1, [%[outptr]], #32\n" // Write back next two elements
+
+                "ZIP1    v2.8h, v22.8h, v23.8h\n"
+                "ZIP2    v3.8h, v22.8h, v23.8h\n"
+                "STP    q2, q3, [%[outptr]], #32\n" // Write back next two elements
+
+                "ZIP1    v4.8h, v30.8h, v31.8h\n"
+                "ZIP2    v5.8h, v30.8h, v31.8h\n"
+                "STP    q4, q5, [%[outptr]], #32\n" // Write back last two elements
+                : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
+                [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
+                : [skippf] "r"(skippf)
+                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
+                "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
+                "v25", "v26", "v27", "v28", "v29", "v30", "v31");
+        }
+
+        for(; x > 0; x--)
+        {
+            *outptr++ = *inptr0++;
+            *outptr++ = *inptr1++;
+            *outptr++ = *inptr2++;
+            *outptr++ = *inptr3++;
+            *outptr++ = *inptr4++;
+            *outptr++ = *inptr5++;
+            *outptr++ = *inptr6++;
+            *outptr++ = *inptr7++;
+        }
+    }
+}
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
new file mode 100644
index 0000000..47e4fa2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../asmlib.hpp"
+
+template <>
+template <typename T>
+inline void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
+{
+    uint32_t       *outptr = (uint32_t *)out;
+    const uint32_t *inptr  = (uint32_t *)in;
+
+    uint32_t zerobuff[8] = {};
+
+    for(int y = y0; y < ymax; y += 8)
+    {
+        const uint32_t *inptr0 = inptr + y * ldin + k0;
+        const uint32_t *inptr1 = inptr0 + ldin;
+        const uint32_t *inptr2 = inptr1 + ldin;
+        const uint32_t *inptr3 = inptr2 + ldin;
+        const uint32_t *inptr4 = inptr3 + ldin;
+        const uint32_t *inptr5 = inptr4 + ldin;
+        const uint32_t *inptr6 = inptr5 + ldin;
+        const uint32_t *inptr7 = inptr6 + ldin;
+
+        prefetch_2x(inptr0);
+        prefetch_2x(inptr1);
+        prefetch_2x(inptr2);
+        prefetch_2x(inptr3);
+        prefetch_2x(inptr4);
+        prefetch_2x(inptr5);
+        prefetch_2x(inptr6);
+        prefetch_2x(inptr7);
+
+        int x = (kmax - k0);
+        for(; x > 7; x -= 8)
+        {
+            /* Cope with ragged cases by copying from a buffer of zeroes instead */
+            if((y + 7) >= ymax)
+            {
+                switch((y + 7) - ymax)
+                {
+                    /* Everything falls through in here */
+                    case 6:
+                        inptr1 = zerobuff;
+                    case 5:
+                        inptr2 = zerobuff;
+                    case 4:
+                        inptr3 = zerobuff;
+                    case 3:
+                        inptr4 = zerobuff;
+                    case 2:
+                        inptr5 = zerobuff;
+                    case 1:
+                        inptr6 = zerobuff;
+                    case 0:
+                        inptr7 = zerobuff;
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+            }
+
+            __asm __volatile(
+                // Load up 8 elements (2 vectors) from each of 8 sources.
+                "LDP        q0, q1, [%[inptr0]], #32\n" // q0=A0A1A2A3
+                "LDP        q2, q3, [%[inptr1]], #32\n" // q2=B0B1B2B3
+                "LDP        q4, q5, [%[inptr2]], #32\n" // q4=C0C1C2C3
+                "ZIP1       v16.4s, v0.4s, v4.4s\n"     // q16=A0C0A1C1
+                ASM_PREFETCH("[%[inptr0], #128]")
+                "LDP        q6, q7, [%[inptr3]], #32\n" // q6=D0D1D2D3
+                "ZIP1       v17.4s, v2.4s, v6.4s\n"     // q17=B0D0B1D1
+                "LDP        q8, q9, [%[inptr4]], #32\n"
+                "LDP        q10, q11, [%[inptr5]], #32\n"
+                "LDP        q12, q13, [%[inptr6]], #32\n"
+                "ZIP1       v18.4s, v8.4s, v12.4s\n" ASM_PREFETCH("[%[inptr1], #128]")
+                "LDP        q14, q15, [%[inptr7]], #32\n"
+                "ZIP1       v19.4s, v10.4s, v14.4s\n"
+
+                "ZIP1       v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0
+                ASM_PREFETCH("[%[inptr2], #128]")
+                "ZIP1       v21.4s, v18.4s, v19.4s\n"
+                "ZIP2       v22.4s, v16.4s, v17.4s\n"
+                "ZIP2       v23.4s, v18.4s, v19.4s\n"
+
+                "ZIP2       v16.4s, v0.4s, v4.4s\n" ASM_PREFETCH("[%[inptr3], #128]")
+                "ZIP2       v17.4s, v2.4s, v6.4s\n"
+                "STP        q20, q21, [%[outptr]], #32\n" // Write back the first element of each source
+
+                "ZIP2       v18.4s, v8.4s, v12.4s\n"
+                "ZIP2       v19.4s, v10.4s, v14.4s\n"
+                "STP        q22, q23, [%[outptr]], #32\n" // Write back the second element of each source
+
+                "ZIP1       v20.4s, v16.4s, v17.4s\n" ASM_PREFETCH("[%[inptr4], #128]")
+                "ZIP1       v21.4s, v18.4s, v19.4s\n"
+                "ZIP2       v22.4s, v16.4s, v17.4s\n"
+                "ZIP2       v23.4s, v18.4s, v19.4s\n"
+
+                "ZIP1       v16.4s, v1.4s, v5.4s\n" ASM_PREFETCH("[%[inptr5], #128]")
+                "ZIP1       v17.4s, v3.4s, v7.4s\n"
+                "STP        q20, q21, [%[outptr]], #32\n" // Third element
+
+                "ZIP1       v18.4s, v9.4s, v13.4s\n"
+                "ZIP1       v19.4s, v11.4s, v15.4s\n"
+                "STP        q22, q23, [%[outptr]], #32\n" // Fourth element
+
+                "ZIP1       v20.4s, v16.4s, v17.4s\n"
+                "ZIP1       v21.4s, v18.4s, v19.4s\n"
+                "ZIP2       v22.4s, v16.4s, v17.4s\n" ASM_PREFETCH("[%[inptr6], #128]")
+                "ZIP2       v23.4s, v18.4s, v19.4s\n"
+
+                "ZIP2       v16.4s, v1.4s, v5.4s\n"
+                "ZIP2       v17.4s, v3.4s, v7.4s\n"
+                "STP        q20, q21, [%[outptr]], #32\n" // Fifth element
+
+                "ZIP2       v18.4s, v9.4s, v13.4s\n" ASM_PREFETCH("[%[inptr7], #128]")
+                "ZIP2       v19.4s, v11.4s, v15.4s\n"
+                "STP        q22, q23, [%[outptr]], #32\n" // Sixth element
+
+                "ZIP1       v20.4s, v16.4s, v17.4s\n"
+                "ZIP1       v21.4s, v18.4s, v19.4s\n"
+                "STP        q20, q21, [%[outptr]], #32\n" // Seventh element
+
+                "ZIP2       v22.4s, v16.4s, v17.4s\n"
+                "ZIP2       v23.4s, v18.4s, v19.4s\n"
+                "STP        q22, q23, [%[outptr]], #32\n" // Eighth element
+                : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
+                [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
+                :
+                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
+                "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
+        }
+
+        for(; x > 0; x--)
+        {
+            *outptr++ = *inptr0++;
+            *outptr++ = *inptr1++;
+            *outptr++ = *inptr2++;
+            *outptr++ = *inptr3++;
+            *outptr++ = *inptr4++;
+            *outptr++ = *inptr5++;
+            *outptr++ = *inptr6++;
+            *outptr++ = *inptr7++;
+        }
+    }
+}
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp
new file mode 100644
index 0000000..85ffdc2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
+
+#include <arm_neon.h>
+
+#include "../asmlib.hpp"
+
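+// As the 8-way 32-bit interleave, but the source is __fp16: each loaded
+// vector is widened to float with FCVTL/FCVTL2 on the fly, so the resulting
+// panel is already in fp32 form for an fp32 kernel to consume.
+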
+template <>
+template <>
+inline void TransformImpl<8, 1, false, 4, 2>::Transform(float *out, const __fp16 *in, int ldin, int y0, int ymax, int k0, int kmax)
+{
+    float        *outptr = out;
+    const __fp16 *inptr  = in;
+
+    __fp16 zerobuff[8] = {};
+
+    for(int y = y0; y < ymax; y += 8)
+    {
+        const __fp16 *inptr0 = inptr + y * ldin + k0;
+        const __fp16 *inptr1 = inptr0 + ldin;
+        const __fp16 *inptr2 = inptr1 + ldin;
+        const __fp16 *inptr3 = inptr2 + ldin;
+        const __fp16 *inptr4 = inptr3 + ldin;
+        const __fp16 *inptr5 = inptr4 + ldin;
+        const __fp16 *inptr6 = inptr5 + ldin;
+        const __fp16 *inptr7 = inptr6 + ldin;
+
+        prefetch_2x(inptr0);
+        prefetch_2x(inptr1);
+        prefetch_2x(inptr2);
+        prefetch_2x(inptr3);
+        prefetch_2x(inptr4);
+        prefetch_2x(inptr5);
+        prefetch_2x(inptr6);
+        prefetch_2x(inptr7);
+
+        int x = (kmax - k0);
+        for(; x > 7; x -= 8)
+        {
+            /* Cope with ragged cases by copying from a buffer of zeroes instead */
+            if((y + 7) >= ymax)
+            {
+                switch((y + 7) - ymax)
+                {
+                    /* Everything falls through in here */
+                    case 6:
+                        inptr1 = zerobuff;
+                    case 5:
+                        inptr2 = zerobuff;
+                    case 4:
+                        inptr3 = zerobuff;
+                    case 3:
+                        inptr4 = zerobuff;
+                    case 2:
+                        inptr5 = zerobuff;
+                    case 1:
+                        inptr6 = zerobuff;
+                    case 0:
+                        inptr7 = zerobuff;
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+            }
+
+            __asm __volatile(
+                // Load up 8 elements (1 vector) from each of 8 sources and widen each to two float vectors.
+                "LDR    q0, [%[inptr0]], #16\n"
+                "LDR    q2, [%[inptr1]], #16\n"
+                "FCVTL2    v1.4s, v0.8h\n"
+                "FCVTL    v0.4s, v0.4h\n"
+                "LDR    q4, [%[inptr2]], #16\n" // q4=C0C1C2C3
+                "FCVTL2    v3.4s, v2.8h\n"
+                "FCVTL    v2.4s, v2.4h\n"
+                "FCVTL2    v5.4s, v4.8h\n"
+                "FCVTL    v4.4s, v4.4h\n"
+                "ZIP1    v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1
+                ASM_PREFETCH("[%[inptr0], #128]")
+                "LDR    q6, [%[inptr3]], #16\n" // q6=D0D1D2D3
+                "FCVTL2    v7.4s, v6.8h\n"
+                "FCVTL    v6.4s, v6.4h\n"
+                "ZIP1    v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1
+                "LDR    q8, [%[inptr4]], #16\n"
+                "LDR    q10, [%[inptr5]], #16\n"
+                "FCVTL2    v9.4s, v8.8h\n"
+                "FCVTL    v8.4s, v8.4h\n" ASM_PREFETCH("[%[inptr1], #128]")
+                "LDR    q12, [%[inptr6]], #16\n"
+                "FCVTL2    v11.4s, v10.8h\n"
+                "FCVTL    v10.4s, v10.4h\n"
+                "FCVTL2    v13.4s, v12.8h\n"
+                "FCVTL    v12.4s, v12.4h\n"
+                "ZIP1    v18.4s, v8.4s, v12.4s\n"
+                "LDR    q14, [%[inptr7]], #16\n"
+                "FCVTL2    v15.4s, v14.8h\n"
+                "FCVTL    v14.4s, v14.4h\n"
+                "ZIP1    v19.4s, v10.4s, v14.4s\n"
+
+                ASM_PREFETCH("[%[inptr2], #128]")
+                "ZIP1    v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0
+                "ZIP1    v21.4s, v18.4s, v19.4s\n"
+                "ZIP2    v22.4s, v16.4s, v17.4s\n"
+                "ZIP2    v23.4s, v18.4s, v19.4s\n" ASM_PREFETCH("[%[inptr3], #128]")
+
+                "ZIP2    v16.4s, v0.4s, v4.4s\n"
+                "ZIP2    v17.4s, v2.4s, v6.4s\n"
+                "STP    q20, q21, [%[outptr]], #32\n" // Write back the first element of each source
+
+                "ZIP2    v18.4s, v8.4s, v12.4s\n" ASM_PREFETCH("[%[inptr4], #128]")
+                "ZIP2    v19.4s, v10.4s, v14.4s\n"
+                "STP    q22, q23, [%[outptr]], #32\n" // Write back the second element of each source
+
+                "ZIP1    v20.4s, v16.4s, v17.4s\n"
+                "ZIP1    v21.4s, v18.4s, v19.4s\n" ASM_PREFETCH("[%[inptr5], #128]")
+                "ZIP2    v22.4s, v16.4s, v17.4s\n"
+                "ZIP2    v23.4s, v18.4s, v19.4s\n"
+
+                "ZIP1    v16.4s, v1.4s, v5.4s\n"
+                "ZIP1    v17.4s, v3.4s, v7.4s\n" ASM_PREFETCH("[%[inptr6], #128]")
+                "STP    q20, q21, [%[outptr]], #32\n" // Third element
+
+                "ZIP1    v18.4s, v9.4s, v13.4s\n"
+                "ZIP1    v19.4s, v11.4s, v15.4s\n"
+                "STP    q22, q23, [%[outptr]], #32\n" // Fourth element
+                ASM_PREFETCH("[%[inptr7], #128]")
+
+                "ZIP1    v20.4s, v16.4s, v17.4s\n"
+                "ZIP1    v21.4s, v18.4s, v19.4s\n"
+                "ZIP2    v22.4s, v16.4s, v17.4s\n"
+                "ZIP2    v23.4s, v18.4s, v19.4s\n"
+
+                "ZIP2    v16.4s, v1.4s, v5.4s\n"
+                "ZIP2    v17.4s, v3.4s, v7.4s\n"
+                "STP    q20, q21, [%[outptr]], #32\n" // Fifth element
+
+                "ZIP2    v18.4s, v9.4s, v13.4s\n"
+                "ZIP2    v19.4s, v11.4s, v15.4s\n"
+                "STP    q22, q23, [%[outptr]], #32\n" // Sixth element
+
+                "ZIP1    v20.4s, v16.4s, v17.4s\n"
+                "ZIP1    v21.4s, v18.4s, v19.4s\n"
+                "STP    q20, q21, [%[outptr]], #32\n" // Seventh element
+
+                "ZIP2    v22.4s, v16.4s, v17.4s\n"
+                "ZIP2    v23.4s, v18.4s, v19.4s\n"
+                "STP    q22, q23, [%[outptr]], #32\n" // Eighth element
+                : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
+                [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
+                :
+                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
+                "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
+        }
+
+        for(; x > 0; x--)
+        {
+            *outptr++ = *inptr0++;
+            *outptr++ = *inptr1++;
+            *outptr++ = *inptr2++;
+            *outptr++ = *inptr3++;
+            *outptr++ = *inptr4++;
+            *outptr++ = *inptr5++;
+            *outptr++ = *inptr6++;
+            *outptr++ = *inptr7++;
+        }
+    }
+}
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp
new file mode 100644
index 0000000..fd6a253
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include "transpose_interleave_common.hpp"
+
+// Generic unblocked transposed 6x32-bit sized specialisation
+template <>
+template <typename T>
+inline void TransformImpl<6, 1, true, 4, 4>::Transform(
+    T *out, const T *const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax)
+{
+    // Redirect to a 12 x uint16_t specialisation
+    TransformImpl<12, 1, true, 2, 2>::Transform(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t *const>(in),
+        stride * 2, x0 * 2, xmax * 2, k0, kmax);
+}
+
+// Generic 12x16-bit sized specialisation
+template <>
+template <typename T>
+inline void TransformImpl<12, 1, true, 2, 2>::Transform(
+    T *out, const T *const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax)
+{
+    // Redirect to a uint16_t specialisation
+    Transform(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t *const>(in),
+        stride, x0, xmax, k0, kmax);
+}
+
+// Specialised 12 x uint16_t version
+template <>
+inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out)
+{
+    __asm volatile(
+        "LDR q0, [%[in0]]\n"
+        "STR q0, [%[out]]\n"
+        "LDR d1, [%[in0], #0x10]\n"
+        "STR d1, [%[out], #0x10]\n"
+        "ADD %x[in0], %x[in0], #0x18\n" ASM_PREFETCH("[%[in0], #192]")
+        : [in0] "+r"(in0),
+        [out] "+r"(out)
+        :
+        : "v0", "v1", "memory");
+}
+
+template <>
+inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out)
+{
+    __asm volatile(
+        "LDR q0, [%[in0]]\n"
+        "LDR d1, [%[in0], #0x10]\n"
+        "ADD %x[in0], %x[in0], #0x18\n" ASM_PREFETCH("[%[in0], #192]")
+
+        "LDR x21, [%[in1]]\n"
+        "LDR q2, [%[in1], #0x08]\n"
+        "INS v1.d[1], x21\n"
+        "ADD %x[in1], %x[in1], #0x18\n"
+        "STP q0, q1, [%[out]]\n"
+        "STR q2, [%x[out], #0x20]\n" ASM_PREFETCH("[%[in1], #192]")
+        : [in0] "+r"(in0),
+        [in1] "+r"(in1),
+        [out] "+r"(out)
+        :
+        : "x21", "v0", "v1", "v2", "memory");
+}
+
+template <>
+inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out)
+{
+    __asm __volatile(
+        "LDR q0, [%x[in0]], #0x10\n"
+        "STR q0, [%x[out]]\n"
+        "LDR d1, [%x[in0]], #0x08\n" ASM_PREFETCH("[%[in0], #192]")
+        "STR d1, [%x[out], #0x10]\n"
+
+        "LDR q0, [%x[in1]], #0x10\n"
+        "STR q0, [%x[out], #0x18]\n"
+        "LDR d1, [%x[in1]], #0x08\n" ASM_PREFETCH("[%[in1], #192]")
+        "STR d1, [%x[out], #0x28]\n"
+
+        "LDR q0, [%x[in2]], #0x10\n"
+        "STR q0, [%x[out], #0x30]\n"
+        "LDR d1, [%x[in2]], #0x08\n" ASM_PREFETCH("[%[in2], #192]")
+        "STR d1, [%x[out], #0x40]\n"
+
+        "LDR q0, [%x[in3]], #0x10\n"
+        "STR q0, [%x[out], #0x48]\n"
+        "LDR d1, [%x[in3]], #0x08\n" ASM_PREFETCH("[%[in3], #192]") "STR d1, [%x[out], #0x58]\n"
+        : [in0] "+r"(in0),
+        [in1] "+r"(in1),
+        [in2] "+r"(in2),
+        [in3] "+r"(in3),
+        [out] "+r"(out)
+        :
+        : "v0", "v1", "memory");
+}
+
+template <>
+template <>
+inline void TransformImpl<12, 1, true, 2, 2>::Transform(
+    uint16_t *out, const uint16_t *const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax)
+{
+    TransposeInterleaveCommon<12, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
+}
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
new file mode 100644
index 0000000..ff1cbfb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
+
+#include "transpose_interleave_common.hpp"
+
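+// Each moveblock_1xN below consumes 12 fp16 values per input row (one 16-byte
+// q load plus one 8-byte d load) and writes the 12 widened floats (48 bytes)
+// per row to the output.
+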
+template <>
+inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x1(const __fp16 *&in0, float *out)
+{
+    __asm __volatile(
+        "LDR    q0, [%[in0]], #16\n"
+        "FCVTL2    v1.4s, v0.8h\n"
+        "FCVTL    v0.4s, v0.4h\n"
+        "STP    q0, q1, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
+        "LDR    d2, [%[in0]], #8\n"
+        "FCVTL    v2.4s, v2.4h\n"
+        "STR    q2, [%[out], #32]\n"
+        : [in0] "+r"(in0), [out] "+r"(out)
+        :
+        : "v0", "v1", "v2", "memory");
+}
+
+template <>
+inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x2(const __fp16 *&in0, const __fp16 *&in1, float *out)
+{
+    __asm __volatile(
+        "LDR    q0, [%[in0]], #16\n"
+        "FCVTL2    v1.4s, v0.8h\n"
+        "FCVTL    v0.4s, v0.4h\n"
+        "STP    q0, q1, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
+        "LDR    d2, [%[in0]], #8\n"
+        "FCVTL    v2.4s, v2.4h\n"
+        "LDR    q3, [%[in1]], #16\n"
+        "FCVTL2    v4.4s, v3.8h\n"
+        "FCVTL    v3.4s, v3.4h\n"
+        "STP    q2, q3, [%[out], #32]\n" ASM_PREFETCH("[%[in1], #192]")
+        "LDR    d5, [%[in1]], #16\n"
+        "FCVTL    v5.4s, v5.4h\n"
+        "STP    q4, q5, [%[out], #64]\n"
+        : [in0] "+r"(in0), [in1] "+r"(in1), [out] "+r"(out)
+        :
+        : "v0", "v1", "v2", "v3", "v4", "v5", "memory");
+}
+
+template <>
+inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x4(const __fp16 *&in0, const __fp16 *&in1, const __fp16 *&in2, const __fp16 *&in3, float *out)
+{
+    __asm __volatile(
+        "LDR    q0, [%[in0]], #16\n"
+        "FCVTL2    v1.4s, v0.8h\n"
+        "FCVTL    v0.4s, v0.4h\n"
+        "STP    q0, q1, [%[out]]\n"
+        "LDR    d2, [%[in0]], #8\n" ASM_PREFETCH("[%[in0], #192]")
+        "FCVTL    v2.4s, v2.4h\n"
+        "LDR    q3, [%[in1]], #16\n"
+        "FCVTL2    v4.4s, v3.8h\n"
+        "FCVTL    v3.4s, v3.4h\n"
+        "STP    q2, q3, [%[out], #32]\n"
+        "LDR    d5, [%[in1]], #8\n"
+        "FCVTL    v5.4s, v5.4h\n" ASM_PREFETCH("[%[in1], #192]")
+        "STP    q4, q5, [%[out], #64]\n"
+        "LDR    q6, [%[in2]], #16\n"
+        "FCVTL2    v7.4s, v6.8h\n"
+        "FCVTL    v6.4s, v6.4h\n"
+        "STP    q6, q7, [%[out], #96]\n"
+        "LDR    d8, [%[in2]], #8\n"
+        "FCVTL    v8.4s, v8.4h\n" ASM_PREFETCH("[%[in2], #192]")
+        "LDR    q9, [%[in3]], #16\n"
+        "FCVTL2    v10.4s, v9.8h\n"
+        "FCVTL    v9.4s, v9.4h\n"
+        "STP    q8, q9, [%[out], #128]\n"
+        "LDR    d11, [%[in3]], #8\n"
+        "FCVTL    v11.4s, v11.4h\n"
+        "STP    q10, q11, [%[out], #160]\n" ASM_PREFETCH("[%[in3], #192]")
+
+        : [in0] "+r"(in0), [in1] "+r"(in1), [in2] "+r"(in2), [in3] "+r"(in3), [out] "+r"(out)
+        :
+        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory");
+}
+
+template <>
+template <>
+inline void TransformImpl<12, 1, true, 4, 2>::Transform(
+    float *out, const __fp16 *const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax)
+{
+    TransposeInterleaveCommon<12, __fp16, float>::Transform(out, in, stride, x0, xmax, k0, kmax);
+}
+
+#endif // __aarch64__ && __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
new file mode 100644
index 0000000..5434599
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include "transpose_interleave_common.hpp"
+
+// Generic unblocked transposed 12x32-bit sized specialisation
+template <>
+template <typename T>
+inline void TransformImpl<12, 1, true, 4, 4>::Transform(
+    T *out, const T *const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax)
+{
+    // Redirect to a 24 x uint16_t specialisation
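+    // Each 32-bit element is treated as a pair of 16-bit elements, so the
+    // stride and the x range are doubled while the k range is unchanged.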
+    TransformImpl<24, 1, true, 2, 2>::Transform(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t *const>(in),
+        stride * 2, x0 * 2, xmax * 2, k0, kmax);
+}
+
+// Generic 24x16-bit sized specialisation
+template <>
+template <typename T>
+inline void TransformImpl<24, 1, true, 2, 2>::Transform(
+    T *out, const T *const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax)
+{
+    // Redirect to a uint16_t specialisation
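+    // (reinterpreting as uint16_t lets all 16-bit element types share the
+    // single assembly implementation below)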
+    Transform(
+        reinterpret_cast<uint16_t *>(out),
+        reinterpret_cast<const uint16_t *const>(in),
+        stride, x0, xmax, k0, kmax);
+}
+
+// Specialised 24 x uint16_t version
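+// Each call below moves 24 uint16_t (48 bytes) per input pointer: an LDP of
+// two q registers followed by one more q load.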
+template <>
+inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out)
+{
+    __asm __volatile(
+        "LDP    q0, q1, [%[in0]], #32\n"
+        "STP    q0, q1, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
+        "LDR    q2, [%[in0]], #16\n"
+        "STR    q2, [%[out], #32]\n"
+        : [in0] "+r"(in0), [out] "+r"(out)
+        :
+        : "v0", "v1", "v2", "memory");
+}
+
+template <>
+inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out)
+{
+    __asm __volatile(
+        "LDP    q0, q1, [%[in0]], #32\n"
+        "STP    q0, q1, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
+        "LDR    q2, [%[in0]], #16\n"
+        "LDP    q3, q4, [%[in1]], #32\n"
+        "STP    q2, q3, [%[out], #32]\n" ASM_PREFETCH("[%[in1], #192]")
+        "LDR    q5, [%[in1]], #16\n"
+        "STP    q4, q5, [%[out], #64]\n"
+        : [in0] "+r"(in0), [in1] "+r"(in1), [out] "+r"(out)
+        :
+        : "v0", "v1", "v2", "v3", "v4", "v5", "memory");
+}
+
+template <>
+inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out)
+{
+    __asm __volatile(
+        "LDP    q0, q1, [%[in0]], #32\n"
+        "STP    q0, q1, [%[out]]\n"
+        "LDR    q2, [%[in0]], #16\n" ASM_PREFETCH("[%[in0], #192]")
+        "LDP    q3, q4, [%[in1]], #32\n"
+        "STP    q2, q3, [%[out], #32]\n"
+        "LDR    q5, [%[in1]], #16\n" ASM_PREFETCH("[%[in1], #192]")
+        "STP    q4, q5, [%[out], #64]\n"
+        "LDP    q6, q7, [%[in2]], #32\n"
+        "STP    q6, q7, [%[out], #96]\n"
+        "LDR    q8, [%[in2]], #16\n" ASM_PREFETCH("[%[in2], #192]")
+        "LDP    q9, q10, [%[in3]], #32\n"
+        "STP    q8, q9, [%[out], #128]\n"
+        "LDR    q11, [%[in3]], #16\n"
+        "STP    q10, q11, [%[out], #160]\n" ASM_PREFETCH("[%[in3], #192]")
+
+        : [in0] "+r"(in0), [in1] "+r"(in1), [in2] "+r"(in2), [in3] "+r"(in3), [out] "+r"(out)
+        :
+        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory");
+}
+
+template <>
+template <>
+inline void TransformImpl<24, 1, true, 2, 2>::Transform(
+    uint16_t *out, const uint16_t *const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax)
+{
+    TransposeInterleaveCommon<24, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
+}
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
new file mode 100644
index 0000000..8ad5b85
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "a32_interleave_6way_32bit.hpp"
+#include "a32_transpose_interleave_8way_32bit.hpp"
+#include "a64_block16_interleave4_8bit.hpp"
+#include "a64_interleave_8way_16bit.hpp"
+#include "a64_interleave_8way_32bit.hpp"
+#include "a64_interleave_8way_half_to_float.hpp"
+#include "a64_transpose_interleave_12way_16bit.hpp"
+#include "a64_transpose_interleave_12way_half_to_float.hpp"
+#include "a64_transpose_interleave_24way_16bit.hpp"
+#include "transpose_interleave_common.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp b/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp
new file mode 100644
index 0000000..3218ca1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+template <unsigned int IntBy, typename TIn, typename TOut>
+struct TransposeInterleaveCommon
+{
+    // Override the moveblock_1xY methods to improve performance
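+    // These element-wise loops are the portable fallback; the per-architecture
+    // transform headers specialise them with hand-written assembly for the
+    // block sizes they support.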
+    static inline void moveblock_1x1(const TIn *&in0, TOut *out)
+    {
+        for(unsigned int i = 0; i < IntBy; i++)
+        {
+            *out++ = static_cast<TOut>(*in0++);
+        }
+    }
+
+    static inline void moveblock_1x2(const TIn *&in0, const TIn *&in1, TOut *out)
+    {
+        for(unsigned int i = 0; i < IntBy; i++)
+        {
+            *out++ = static_cast<TOut>(*in0++);
+        }
+        for(unsigned int i = 0; i < IntBy; i++)
+        {
+            *out++ = static_cast<TOut>(*in1++);
+        }
+    }
+
+    static inline void moveblock_1x4(const TIn *&in0, const TIn *&in1, const TIn *&in2, const TIn *&in3, TOut *out)
+    {
+        for(unsigned int i = 0; i < IntBy; i++)
+        {
+            *out++ = static_cast<TOut>(*in0++);
+        }
+        for(unsigned int i = 0; i < IntBy; i++)
+        {
+            *out++ = static_cast<TOut>(*in1++);
+        }
+        for(unsigned int i = 0; i < IntBy; i++)
+        {
+            *out++ = static_cast<TOut>(*in2++);
+        }
+        for(unsigned int i = 0; i < IntBy; i++)
+        {
+            *out++ = static_cast<TOut>(*in3++);
+        }
+    }
+
+    static inline void Transform(TOut *out, const TIn *in, const int stride, const int x0, const int xmax, const int k0, const int kmax)
+    {
+        const auto ldin = stride;
+
+        TOut      *outarray    = out;
+        const TIn *inarray     = in;
+        TOut      *outptr_base = outarray;
+        const TIn *inptr_base  = inarray + x0 + (k0 * ldin);
+        int        ldout       = (kmax - k0) * IntBy;
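+        // ldout is the size of one complete IntBy-wide output panel (IntBy
+        // elements for every k), i.e. the distance between consecutive x
+        // blocks in the output.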
+
+        int k = (kmax - k0);
+        for(; k > 3; k -= 4)
+        {
+            TOut      *outptr = outptr_base;
+            const TIn *inptr  = inptr_base;
+            const TIn *inptr1 = inptr + ldin;
+            const TIn *inptr2 = inptr1 + ldin;
+            const TIn *inptr3 = inptr2 + ldin;
+
+            prefetch_3x(inptr);
+            prefetch_3x(inptr1);
+            prefetch_3x(inptr2);
+            prefetch_3x(inptr3);
+
+            outptr_base += IntBy * 4;
+            inptr_base += ldin * 4;
+
+            for(int x = (xmax - x0) / IntBy; x > 0; x--)
+            {
+                moveblock_1x4(inptr, inptr1, inptr2, inptr3, outptr);
+                outptr += ldout;
+            }
+        }
+
+        if(k)
+        {
+            TOut      *outptr = outptr_base;
+            const TIn *inptr  = inptr_base;
+            const TIn *inptr1 = inptr + ldin;
+            const TIn *inptr2 = inptr1 + ldin;
+
+            prefetch_3x(inptr);
+            prefetch_3x(inptr1);
+            prefetch_3x(inptr2);
+
+            for(int x = (xmax - x0) / IntBy; x > 0; x--)
+            {
+                switch(k)
+                {
+                    case 3:
+                        moveblock_1x2(inptr, inptr1, outptr);
+                        moveblock_1x1(inptr2, outptr + IntBy * 2);
+                        break;
+
+                    case 2:
+                        moveblock_1x2(inptr, inptr1, outptr);
+                        break;
+
+                    case 1:
+                        moveblock_1x1(inptr, outptr);
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+
+                outptr += ldout;
+            }
+        }
+
+        // Cope with ragged X cases
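+        // A final partial block (fewer than IntBy columns left) is copied
+        // element by element and zero-padded out to a full IntBy width.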
+        const unsigned int overflow = (xmax - x0) % IntBy;
+        if(overflow)
+        {
+            const TIn *inptr_base = inarray + (xmax - overflow) + (k0 * ldin);
+            TOut      *outptr     = outarray + ((xmax - x0) / IntBy) * ldout;
+
+            for(int k = (kmax - k0); k > 0; k--)
+            {
+                const TIn *inptr = inptr_base;
+                inptr_base += ldin;
+
+                for(unsigned int x = 0; x < IntBy; x++)
+                {
+                    TOut val  = (x < overflow) ? static_cast<TOut>(*inptr++) : static_cast<TOut>(0);
+                    *outptr++ = val;
+                }
+            }
+        }
+    }
+};
diff --git a/src/core/NEON/kernels/arm_gemm/utils.hpp b/src/core/NEON/kernels/arm_gemm/utils.hpp
new file mode 100644
index 0000000..6c5b92a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/utils.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+// Macro for unreachable code (e.g. impossible default cases on switch)
+#define UNREACHABLE(why) __builtin_unreachable()
+
+// Paranoid option for the above with assert
+// #define UNREACHABLE(why)   assert(0 && why)
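+// (the assert() variant above would additionally need <cassert> to be included)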
+
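+// Integer division rounded up: (a + b - 1) / b == ceil(a / b) for
+// non-negative a and positive b.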
+inline int iceildiv(const int a, const int b)
+{
+    return (a + b - 1) / b;
+}
+
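+// Round a up to the next multiple of b; a is returned unchanged when it is
+// already a multiple.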
+template <typename T>
+inline T roundup(const T a, const T b)
+{
+    T rem = a % b;
+
+    if(rem)
+    {
+        return a + b - rem;
+    }
+    else
+    {
+        return a;
+    }
+}