COMPMID-481: Add AArch64 GEMM

Change-Id: I34f94f99cb05f0eabafee13c5e623ee779b72360
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/83741
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Reviewed-by: Pablo Tello <pablo.tello@arm.com>
diff --git a/arm_compute/core/NEON/kernels/assembly/asmlib.hpp b/arm_compute/core/NEON/kernels/assembly/asmlib.hpp
new file mode 100644
index 0000000..fa1d6e3
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/asmlib.hpp
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+// Macros to use in assembler to issue a preload.  They exist because various
+// workarounds are needed to get reliable preload behaviour.
+//
+// Code using these macros must list x20 and x21 as clobbered, as they may be
+// used by the workaround.
+
+#define ASM_PREFETCH(address)    "PRFM PLDL1KEEP, " address "\n"
+#define ASM_PREFETCHL2(address)  "PRFM PLDL2KEEP, " address "\n"
+#define ASM_PREFETCHW(address)   "PRFM PSTL1KEEP, " address "\n"
+#define ASM_PREFETCHWL2(address) "PRFM PSTL2KEEP, " address "\n"
+
+#else
+
+#define ASM_PREFETCH(address)     "PLD " address "\n"
+#define ASM_PREFETCHW(address)    "PLDW " address "\n"
+
+#endif
+
+/*
+ * Do some prefetches.
+ */
+template <typename T>
+static inline void prefetch_6x(const T *pfp) {
+    __asm __volatile (
+        ASM_PREFETCH("[%[pfp]]")
+        ASM_PREFETCH("[%[pfp], #64]")
+        ASM_PREFETCH("[%[pfp], #128]")
+        ASM_PREFETCH("[%[pfp], #192]")
+        ASM_PREFETCH("[%[pfp], #256]")
+        ASM_PREFETCH("[%[pfp], #320]")
+    :
+    : [pfp] "r" (pfp)
+    : "memory"
+    );
+}
+
+template <typename T>
+static inline void prefetch_5x(const T *pfp) {
+    __asm __volatile (
+        ASM_PREFETCH("[%[pfp]]")
+        ASM_PREFETCH("[%[pfp], #64]")
+        ASM_PREFETCH("[%[pfp], #128]")
+        ASM_PREFETCH("[%[pfp], #192]")
+        ASM_PREFETCH("[%[pfp], #256]")
+    :
+    : [pfp] "r" (pfp)
+    : "memory"
+    );
+}
+
+template <typename T>
+static inline void prefetch_4x(const T *pfp) {
+    __asm __volatile (
+        ASM_PREFETCH("[%[pfp]]")
+        ASM_PREFETCH("[%[pfp], #64]")
+        ASM_PREFETCH("[%[pfp], #128]")
+        ASM_PREFETCH("[%[pfp], #192]")
+    :
+    : [pfp] "r" (pfp)
+    : "memory"
+    );
+}
+
+template <typename T>
+static inline void prefetch_3x(const T *pfp) {
+    __asm __volatile (
+        ASM_PREFETCH("[%[pfp]]")
+        ASM_PREFETCH("[%[pfp], #64]")
+        ASM_PREFETCH("[%[pfp], #128]")
+    :
+    : [pfp] "r" (pfp)
+    : "memory"
+    );
+}
+
+template <typename T>
+static inline void prefetch_2x(const T *pfp) {
+    __asm __volatile (
+        ASM_PREFETCH("[%[pfp]]")
+        ASM_PREFETCH("[%[pfp], #64]")
+    :
+    : [pfp] "r" (pfp)
+    : "memory"
+    );
+}
+
+template <typename T>
+static inline void prefetch_1x(const T *pfp) {
+    __asm __volatile (
+        ASM_PREFETCH("[%[pfp]]")
+    :
+    : [pfp] "r" (pfp)
+    : "memory"
+    );
+}
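+
+// Illustrative use (a sketch - "panel" stands for an assumed pointer into
+// operand data that a kernel is about to stream through):
+//
+//   prefetch_6x(panel);   // pulls in the next six cache lines (384 bytes)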
diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
new file mode 100644
index 0000000..0097443
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+// Abstract class for a GEMM function
+template<typename To, typename Tr>
+class GemmCommon {
+public:
+    virtual size_t get_working_size() const = 0;
+    virtual void execute(const To *, const int, const To *, const int, Tr *, const int, const Tr, const Tr, void *working_space = NULL) const = 0;
+    virtual ~GemmCommon() { }
+};
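+
+// Illustrative caller contract (a sketch: "gemm" is any concrete
+// implementation, and A/lda/B/ldb/C/ldc/alpha/beta are assumed to be
+// supplied by the caller):
+//
+//   std::vector<int8_t> working_space(gemm->get_working_size());
+//   // Computes C = alpha * (A * B) + beta * C on the supplied arrays.
+//   gemm->execute(A, lda, B, ldb, C, ldc, alpha, beta, working_space.data());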
diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp
new file mode 100644
index 0000000..a186d88
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <stdio.h>
+
+#include "gemm_common.hpp"
+#include "profiler.hpp"
+#include "transform.hpp"
+#include "mergeresults.hpp"
+
+// Some macros used to decide how much working space to allocate.
+// Round allocations up to the next cache line.
+#define ALLOC_ROUND	64
+#define ROUND_UP(x)	((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)
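+// e.g. with ALLOC_ROUND == 64: ROUND_UP(1) == 64, ROUND_UP(64) == 64, ROUND_UP(100) == 128.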
+
+// Implementation of the GemmCommon abstract class.
+//
+// This implementation interleaves the source matrices in blocks - good for
+// larger matrices.
+template<typename strategy, typename To, typename Tr>
+class GemmInterleaved : public GemmCommon<To, Tr> {
+    typedef typename strategy::operand_type Toi;
+    typedef typename strategy::result_type Tri;
+
+    const unsigned int M;
+    const unsigned int N;
+    const unsigned int K;
+
+    const bool trA;
+    const bool trB;
+
+    const strategy strat;
+
+    unsigned int k_block = 0;
+    unsigned int x_block = 0;
+    unsigned int Mround = 0;
+
+    size_t get_a_working_size() const {
+        return ROUND_UP(sizeof(Toi) * k_block * Mround);
+    }
+
+    size_t get_b_working_size() const {
+        return ROUND_UP(sizeof(Toi) * x_block * k_block);
+    }
+
+    size_t get_c_working_size() const {
+        return ROUND_UP(sizeof(Tri) * x_block * strat.out_height);
+    }
+
+public:
+    size_t get_working_size() const override {
+        return get_a_working_size() + get_b_working_size() + get_c_working_size();
+    }
+
+    GemmInterleaved(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K, const bool trA, const bool trB) : M(M), N(N), K(K), trA(trA), trB(trB), strat(ci) {
+        const unsigned int L1_size = ci->L1_size;
+        const unsigned int L2_size = ci->L2_size;
+
+        // Work out blocking parameters
+        // k_block: Each iteration will consume (out_width + out_height)
+        // operands - so how many iterations will fill the L1?
+        k_block = L1_size / (sizeof(Toi) * (strat.out_width + strat.out_height));
+
+        // Needs to be a multiple of the K unroll level.
+        k_block /= strat.k_unroll;
+        k_block *= strat.k_unroll;
+
+        // Now tune to presented problem size; this is how many blocks we need.
+        int num_k_blocks = (K + (k_block - 1)) / k_block;
+
+        // So divide the space equally into that many blocks.
+        k_block = (K + num_k_blocks - 1) / num_k_blocks;
+
+        // And round UP to the K unroll level required.
+        k_block = (k_block + strat.k_unroll - 1) / strat.k_unroll;
+        k_block *= strat.k_unroll;
+
+        // x_block: Work out how many rows (of length k_block) will fit in the L2
+        x_block = L2_size / (sizeof(Toi) * k_block);
+
+        // Needs to be a multiple of the kernel output width.
+        x_block /= strat.out_width;
+        x_block *= strat.out_width;
+
+        // And tune to the presented problem size.
+        int num_x_blocks = (N + (x_block - 1)) / x_block;
+        x_block = (N + num_x_blocks - 1) / num_x_blocks;
+
+        x_block = (x_block + strat.out_width - 1) / strat.out_width;
+        x_block *= strat.out_width;
+
+        // Work out the rounded size of M - needed for some buffers.
+        Mround = (M + (strat.out_height - 1)) / strat.out_height;
+        Mround *= strat.out_height;
+    }
+
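+    // Worked example of the blocking above (illustrative figures only):
+    // assume a 32KB L1, a 512KB L2, float operands and the 12x8 kernel
+    // (out_width=12, out_height=8, k_unroll=1), with K=1000, N=500, M=100:
+    //   k_block = 32768 / (4 * 20)   = 409 -> 3 K blocks -> k_block = 334
+    //   x_block = 524288 / (4 * 334) = 392 -> round down to 384 -> 2 X blocks -> x_block = 252
+    //   Mround  = ceil(100 / 8) * 8  = 104
+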
+    // Actually execute the GEMM.
+    void execute(const To *A, const int lda, const To *B, const int ldb, Tr *C, const int ldc, const Tr alpha, const Tr beta, void *working_space) const override {
+        profiler prof;
+
+        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
+        intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space_bytes);
+        size_t diff = 0;
+
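+        // Align the start of the working space (and hence the panel pointers
+        // derived below) to a 16-byte boundary.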
+        if (working_space_int & 0xF) {
+            diff = 0x10 - (working_space_int & 0xF);
+        }
+
+        // TODO: Multithreaded implementations could share the burden of transforming these blocks.
+        Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + diff);
+        Toi * const b_panel = reinterpret_cast<Toi *>(working_space_bytes + get_a_working_size() + diff);
+        Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + get_a_working_size() + get_b_working_size() + diff);
+
+        for (unsigned int k0=0; k0<K; k0 += k_block) {
+            unsigned int kmax = k0 + k_block;
+            if (kmax > K) kmax = K;
+
+            // Figure out how many "K" the kernel will actually process.
+            int kern_k = ((kmax - k0) + (strat.k_unroll - 1)) / strat.k_unroll;
+            kern_k *= strat.k_unroll;
+
+            prof(PROFILE_PREPA, [&](void) {
+                if (trA ^ strategy::A_transpose) {
+                    Transform<strategy::A_interleave, strategy::A_block, true>(a_panel, A, lda, 0, M, k0, kmax);
+                } else {
+                    Transform<strategy::A_interleave, strategy::A_block, false>(a_panel, A, lda, 0, M, k0, kmax);
+                }
+            });
+
+            for (unsigned int x0=0; x0<N; x0 += x_block) {
+                unsigned int xmax = x0 + x_block;
+                if (xmax > N) xmax = N;
+
+                int bblocks = (xmax - x0 + strat.out_width - 1) / strat.out_width;
+
+                prof(PROFILE_PREPB, [&](void) {
+                    if (trB ^ strategy::B_transpose) {
+                        Transform<strategy::B_interleave, strategy::B_block, true>(b_panel, B, ldb, x0, xmax, k0, kmax);
+                    } else {
+                        Transform<strategy::B_interleave, strategy::B_block, false>(b_panel, B, ldb, x0, xmax, k0, kmax);
+                    }
+                });
+
+                for (unsigned int y=0; y<M; y+=strat.out_height) {
+                    unsigned int ymax = y + strat.out_height;
+                    if (ymax > M) ymax = M;
+
+                    prof(PROFILE_KERNEL, [&](void) { strat.kernel(a_panel + (y * kern_k), b_panel, c_panel, 1, bblocks, kern_k); });
+                    prof(PROFILE_MERGE, [&](void) { MergeResults<strategy::out_width, strategy::out_height>(C, c_panel, ldc, y, ymax, x0, xmax, alpha, (k0==0 ? beta : static_cast<Tr>(1))); });
+                }
+            }
+        }
+    }
+};
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp
new file mode 100644
index 0000000..e229e21
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+// Actual kernel implementations
+#include "a64_sgemm_12x8/generic.hpp"
+#include "a64_sgemm_12x8/a53.hpp"
+
+// 12x8 SGEMM "strategy" class.
+//
+// This describes the characteristics of a family of kernels, in terms of
+// the required interleave properties and the output block size.
+//
+// All kernels in the family must share these characteristics.  The actual
+// kernel to be used can be chosen at runtime, based on the CPUInfo
+// structure passed to the constructor.
+class sgemm_12x8 {
+public:
+    typedef float operand_type;
+    typedef float result_type;
+
+    typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
+
+    /* Describes the data layout for A input */
+    static const int A_interleave = 8;
+    static const int A_block = 1;
+    static const int A_transpose = 0;
+
+    /* Same for B input */
+    static const int B_interleave = 12;
+    static const int B_block = 1;
+    static const int B_transpose = 1;
+
+    /* Kernel blocking parameters */
+    static const int out_width = 12;
+    static const int out_height = 8;
+    static const int k_unroll = 1;
+
+    kern_type kernel{nullptr};
+
+    sgemm_12x8(const CPUInfo *ci) {
+        kernel = a64_sgemm_asimd_12x8;
+        if (ci->CPU == CPUTarget::A53) {
+            kernel = a64_sgemm_asimd_12x8_a53;
+        }
+    }
+};
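+
+// Illustrative tie-in with the interleaved driver (a sketch; ci, M, N, K and
+// the transpose flags are assumed to come from the caller):
+//
+//   GemmInterleaved<sgemm_12x8, float, float> gemm(ci, M, N, K, false, false);
+//   // then allocate gemm.get_working_size() bytes and call gemm.execute().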
+
+#endif // __aarch64__
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp
new file mode 100644
index 0000000..e58ce66
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp
@@ -0,0 +1,367 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
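+// Cortex-A53 tuned variant of the 12x8 kernel.  Rather than plain 128-bit
+// vector loads, operands are brought in as a 64-bit vector load plus a
+// 64-bit general-purpose load whose value is then INSerted into the upper
+// half (LDR d / LDR x / INS), a pattern that issues more smoothly on the
+// A53 pipeline.  x20 is used as the staging register, hence the clobber.
+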
+inline void a64_sgemm_asimd_12x8_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+    const float *a_ptr = Apanel;
+    float *c_ptr = Cpanel;
+
+    for (int yb=0; yb<ablocks; yb++) {
+        const float *a_ptr0 = a_ptr;
+        const float *b_ptr = Bpanel;
+
+        for (int xb=0; xb<bblocks; xb++) {
+            a_ptr = a_ptr0;
+            // Fix up for odd lengths - set a flag if K is odd, but make
+            // sure we round up the iteration count.
+            int oddk = (K & 1);
+            int k = ((K+1)/2) - 1;
+
+            register float32x4_t a0  asm("v0");
+            register float32x4_t a1  asm("v1");
+            register float32x4_t b0  asm("v2");
+            register float32x4_t b1  asm("v3");
+            register float32x4_t b2  asm("v4");
+            register float32x4_t a0a asm("v5");
+            register float32x4_t a1a asm("v6");
+
+            __asm __volatile (
+                // Initialize result registers, load initial operands, prime prefetches.
+                "movi	v8.4s, #0x0\n"
+                "ldr	%q[a0], [%[a_ptr]]\n"
+                "movi	v9.4s, #0x0\n"
+                "ldr	%q[b0], [%[b_ptr]]\n"
+                "movi	v10.4s, #0x0\n"
+                "ldr	%q[a1], [%[a_ptr], #16]\n"
+                "movi	v11.4s, #0x0\n"
+                "ldr	%q[b1], [%[b_ptr], #16]\n"
+                "movi	v12.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi	v13.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi	v14.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi	v15.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #128]")
+                "movi	v16.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi	v17.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi	v18.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi	v19.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi	v20.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #256]")
+                "movi	v21.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #384]")
+                "movi	v22.4s, #0x0\n"
+                "movi	v23.4s, #0x0\n"
+                "movi	v24.4s, #0x0\n"
+                "movi	v25.4s, #0x0\n"
+                "movi	v26.4s, #0x0\n"
+                "movi	v27.4s, #0x0\n"
+                "movi	v28.4s, #0x0\n"
+                "movi	v29.4s, #0x0\n"
+                "movi	v30.4s, #0x0\n"
+                "movi	v31.4s, #0x0\n"
+
+                // Skip loop if we are doing zero iterations of it.
+                "cbz	%w[k], 4f\n"
+
+                "1:\n"
+                // Unroll 0
+                "ldr	%d[b2], [%[b_ptr], #32]\n"
+                "nop\n"
+                "fmla	v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "ldr	x20, [%[b_ptr], #40]\n"
+                "fmla	v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "subs	%w[k], %w[k], #1\n"
+                "fmla	v10.4s, %[b0].4s, %[a0].s[2]\n"
+
+                "ldr	%d[a0a], [%[a_ptr], #32]\n"
+                "ins	%[b2].d[1], x20\n"
+                "fmla	v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "ldr	x20, [%[a_ptr], #40]\n"
+                "fmla	v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla	v13.4s, %[b0].4s, %[a1].s[1]\n"
+
+                "ldr	%d[a1a], [%[a_ptr], #48]\n"
+                "ins	%[a0a].d[1], x20\n"
+                "fmla	v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "ldr	x20, [%[a_ptr], #56]\n"
+                "fmla	v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "fmla	v16.4s, %[b1].4s, %[a0].s[0]\n"
+
+                "ldr	%d[b0], [%[b_ptr], #48]\n"
+                "ins	%[a1a].d[1], x20\n"
+                "fmla	v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "ldr	x20, [%[b_ptr], #56]\n"
+                "fmla	v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0].s[3]\n"
+
+                ASM_PREFETCH("[%[a_ptr], #320]")
+                "ins	%[b0].d[1], x20\n"
+                "fmla	v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1].s[2]\n"
+
+                ASM_PREFETCH("[%[b_ptr], #448]")
+                "nop\n"
+                "fmla	v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "fmla	v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0].s[1]\n"
+
+                "ldr	%d[b1], [%[b_ptr], #64]\n"
+                "nop\n"
+                "fmla	v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "ldr	x20, [%[b_ptr], #72]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "fmla	v28.4s, %[b2].4s, %[a1].s[0]\n"
+
+                ASM_PREFETCH("[%[b_ptr], #512]")
+                "ins	%[b1].d[1], x20\n"
+                "fmla	v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "fmla	v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "fmla	v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+                // Unroll 1
+                "ldr	%d[b2], [%[b_ptr], #80]\n"
+                "nop\n"
+                "fmla	v8.4s , %[b0].4s, %[a0a].s[0]\n"
+                "ldr	x20, [%[b_ptr], #88]\n"
+                "fmla	v9.4s , %[b0].4s, %[a0a].s[1]\n"
+                "fmla	v10.4s, %[b0].4s, %[a0a].s[2]\n"
+
+                "ldr	%d[a0], [%[a_ptr], #64]\n"
+                "ins	%[b2].d[1], x20\n"
+                "fmla	v11.4s, %[b0].4s, %[a0a].s[3]\n"
+                "ldr	x20, [%[a_ptr], #72]\n"
+                "fmla	v12.4s, %[b0].4s, %[a1a].s[0]\n"
+                "fmla	v13.4s, %[b0].4s, %[a1a].s[1]\n"
+
+                "ldr	%d[a1], [%[a_ptr], #80]\n"
+                "ins	%[a0].d[1], x20\n"
+                "fmla	v14.4s, %[b0].4s, %[a1a].s[2]\n"
+                "ldr	x20, [%[a_ptr], #88]\n"
+                "fmla	v15.4s, %[b0].4s, %[a1a].s[3]\n"
+                "fmla	v16.4s, %[b1].4s, %[a0a].s[0]\n"
+
+                "ldr	%d[b0], [%[b_ptr], #96]\n"
+                "ins	%[a1].d[1], x20\n"
+                "fmla	v17.4s, %[b1].4s, %[a0a].s[1]\n"
+                "ldr	x20, [%[b_ptr], #104]\n"
+                "fmla	v18.4s, %[b1].4s, %[a0a].s[2]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0a].s[3]\n"
+
+                "nop\n"
+                "ins	%[b0].d[1], x20\n"
+                "fmla	v20.4s, %[b1].4s, %[a1a].s[0]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1a].s[1]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1a].s[2]\n"
+
+                "nop\n"
+                "nop\n"
+                "fmla	v23.4s, %[b1].4s, %[a1a].s[3]\n"
+                "fmla	v24.4s, %[b2].4s, %[a0a].s[0]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0a].s[1]\n"
+
+                "ldr	%d[b1], [%[b_ptr], #112]\n"
+                "nop\n"
+                "fmla	v26.4s, %[b2].4s, %[a0a].s[2]\n"
+                "ldr	x20, [%[b_ptr], #120]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0a].s[3]\n"
+                "add	%[a_ptr], %[a_ptr], #64\n"
+                "fmla	v28.4s, %[b2].4s, %[a1a].s[0]\n"
+                "add	%[b_ptr], %[b_ptr], #96\n"
+
+                "nop\n"
+                "ins	%[b1].d[1], x20\n"
+                "fmla	v29.4s, %[b2].4s, %[a1a].s[1]\n"
+                "fmla	v30.4s, %[b2].4s, %[a1a].s[2]\n"
+                "fmla	v31.4s, %[b2].4s, %[a1a].s[3]\n"
+
+                "bne	1b\n"
+
+                // Branched to when K is 1 or 2 (zero iterations of the main loop);
+                // otherwise reached by falling out of the loop.  Handle the odd/even tail below.
+                "4:\n"
+                "cbnz	%[oddk], 2f\n"
+
+                // Detached final iteration. (even K)
+                "ldr	%d[b2], [%[b_ptr], #32]\n"
+                "nop\n"
+                "fmla	v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "ldr	x20, [%[b_ptr], #40]\n"
+                "fmla	v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "subs	%w[k], %w[k], #1\n"
+                "fmla	v10.4s, %[b0].4s, %[a0].s[2]\n"
+
+                "ldr	%d[a0a], [%[a_ptr], #32]\n"
+                "ins	%[b2].d[1], x20\n"
+                "fmla	v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "ldr	x20, [%[a_ptr], #40]\n"
+                "fmla	v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla	v13.4s, %[b0].4s, %[a1].s[1]\n"
+
+                "ldr	%d[a1a], [%[a_ptr], #48]\n"
+                "ins	%[a0a].d[1], x20\n"
+                "fmla	v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "ldr	x20, [%[a_ptr], #56]\n"
+                "fmla	v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "fmla	v16.4s, %[b1].4s, %[a0].s[0]\n"
+
+                "ldr	%d[b0], [%[b_ptr], #48]\n"
+                "ins	%[a1a].d[1], x20\n"
+                "fmla	v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "ldr	x20, [%[b_ptr], #56]\n"
+                "fmla	v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0].s[3]\n"
+
+                "ins	%[b0].d[1], x20\n"
+                "fmla	v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1].s[2]\n"
+
+                "nop\n"
+                "fmla	v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "fmla	v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0].s[1]\n"
+
+                "ldr	%d[b1], [%[b_ptr], #64]\n"
+                "nop\n"
+                "fmla	v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "ldr	x20, [%[b_ptr], #72]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "fmla	v28.4s, %[b2].4s, %[a1].s[0]\n"
+
+                "ins	%[b1].d[1], x20\n"
+                "fmla	v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "fmla	v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "fmla	v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+                "ldr	%d[b2], [%[b_ptr], #80]\n"
+                "nop\n"
+                "fmla	v8.4s , %[b0].4s, %[a0a].s[0]\n"
+                "ldr	x20, [%[b_ptr], #88]\n"
+                "fmla	v9.4s , %[b0].4s, %[a0a].s[1]\n"
+                "fmla	v10.4s, %[b0].4s, %[a0a].s[2]\n"
+
+                "ins	%[b2].d[1], x20\n"
+                "fmla	v11.4s, %[b0].4s, %[a0a].s[3]\n"
+                "fmla	v12.4s, %[b0].4s, %[a1a].s[0]\n"
+                "fmla	v13.4s, %[b0].4s, %[a1a].s[1]\n"
+                "fmla	v14.4s, %[b0].4s, %[a1a].s[2]\n"
+                "fmla	v15.4s, %[b0].4s, %[a1a].s[3]\n"
+                "fmla	v16.4s, %[b1].4s, %[a0a].s[0]\n"
+                "fmla	v17.4s, %[b1].4s, %[a0a].s[1]\n"
+                "fmla	v18.4s, %[b1].4s, %[a0a].s[2]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0a].s[3]\n"
+                "fmla	v20.4s, %[b1].4s, %[a1a].s[0]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1a].s[1]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1a].s[2]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1a].s[3]\n"
+                "fmla	v24.4s, %[b2].4s, %[a0a].s[0]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0a].s[1]\n"
+                "fmla	v26.4s, %[b2].4s, %[a0a].s[2]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0a].s[3]\n"
+                "fmla	v28.4s, %[b2].4s, %[a1a].s[0]\n"
+                "fmla	v29.4s, %[b2].4s, %[a1a].s[1]\n"
+                "add	%[a_ptr], %[a_ptr], #64\n"
+                "fmla	v30.4s, %[b2].4s, %[a1a].s[2]\n"
+                "add	%[b_ptr], %[b_ptr], #96\n"
+                "fmla	v31.4s, %[b2].4s, %[a1a].s[3]\n"
+                "b	3f\n"
+
+                // Detached final iteration. (odd K)
+                "2:\n"
+                "ldr	%d[b2], [%[b_ptr], #32]\n"
+                "nop\n"
+                "fmla	v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "ldr	x20, [%[b_ptr], #40]\n"
+                "fmla	v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "fmla	v10.4s, %[b0].4s, %[a0].s[2]\n"
+
+                "ins	%[b2].d[1], x20\n"
+                "fmla	v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "fmla	v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla	v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "fmla	v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "fmla	v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "fmla	v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "fmla	v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "fmla	v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "fmla	v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "fmla	v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0].s[1]\n"
+                "fmla	v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "fmla	v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "fmla	v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "add	%[a_ptr], %[a_ptr], #32\n"
+                "fmla	v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "add	%[b_ptr], %[b_ptr], #48\n"
+                "fmla	v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+                // Common tail
+                "3:\n"
+                "str	q8,  [%[c_ptr]]\n"
+                "str	q16,  [%[c_ptr], #16]\n"
+                "str	q24,  [%[c_ptr], #32]\n"
+                "str	q9,  [%[c_ptr], #48]\n"
+                "str	q17,  [%[c_ptr], #64]\n"
+                "str	q25,  [%[c_ptr], #80]\n"
+                "str	q10,  [%[c_ptr], #96]\n"
+                "str	q18,  [%[c_ptr], #112]\n"
+                "str	q26,  [%[c_ptr], #128]\n"
+                "str	q11,  [%[c_ptr], #144]\n"
+                "str	q19,  [%[c_ptr], #160]\n"
+                "str	q27,  [%[c_ptr], #176]\n"
+                "str	q12,  [%[c_ptr], #192]\n"
+                "str	q20,  [%[c_ptr], #208]\n"
+                "str	q28,  [%[c_ptr], #224]\n"
+                "str	q13,  [%[c_ptr], #240]\n"
+                "str	q21,  [%[c_ptr], #256]\n"
+                "str	q29,  [%[c_ptr], #272]\n"
+                "str	q14,  [%[c_ptr], #288]\n"
+                "str	q22,  [%[c_ptr], #304]\n"
+                "str	q30,  [%[c_ptr], #320]\n"
+                "str	q15,  [%[c_ptr], #336]\n"
+                "str	q23,  [%[c_ptr], #352]\n"
+                "str	q31,  [%[c_ptr], #368]\n"
+                "add	%[c_ptr], %[c_ptr], #384\n"
+            :
+              [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+              [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
+              [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
+            : [oddk] "r" (oddk)
+            : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+              "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+            );
+        }
+    }
+}
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp
new file mode 100644
index 0000000..082c200
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp
@@ -0,0 +1,358 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <arm_neon.h>
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 12x8), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
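+//
+// For example (illustrative sizes only): with ablocks=1, bblocks=3 and K=64,
+// the A panel holds 1*8*64 floats, the B panel holds 3*12*64 floats, and
+// Cpanel receives 3 output blocks of 12x8 = 96 floats each.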
+
+inline void a64_sgemm_asimd_12x8_jumps(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K, long int row_jump=0, long int block_jump=0) {
+    const float *a_ptr = Apanel;
+    float *c_ptr = Cpanel;
+
+    for (int yb=0; yb<ablocks; yb++) {
+        const float *a_ptr0 = a_ptr;
+        const float *b_ptr = Bpanel;
+
+        for (int xb=0; xb<bblocks; xb++) {
+            a_ptr = a_ptr0;
+            // Fix up for odd lengths - set a flag if K is odd, but make
+            // sure we round up the iteration count.
+            int oddk = (K & 1);
+            int k = ((K+1)/2) - 1;
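+            // e.g. K=5 -> oddk=1, k=2: two unrolled (2xK) iterations, then the odd tail.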
+
+            register float32x4_t a0  asm("v0");
+            register float32x4_t a1  asm("v1");
+            register float32x4_t b0  asm("v2");
+            register float32x4_t b1  asm("v3");
+            register float32x4_t b2  asm("v4");
+            register float32x4_t a0a asm("v5");
+            register float32x4_t a1a asm("v6");
+
+            __asm __volatile (
+                // Initialize result registers, load initial operands, prime prefetches.
+                "movi	v8.4s, #0x0\n"
+                "ldr	%q[a0], [%[a_ptr]]\n"
+                "movi	v9.4s, #0x0\n"
+                "ldr	%q[b0], [%[b_ptr]]\n"
+                "movi	v10.4s, #0x0\n"
+                "ldr	%q[a1], [%[a_ptr], #16]\n"
+                "movi	v11.4s, #0x0\n"
+                "ldr	%q[b1], [%[b_ptr], #16]\n"
+                "movi	v12.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi	v13.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi	v14.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi	v15.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #128]")
+                "movi	v16.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi	v17.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi	v18.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi	v19.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi	v20.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #256]")
+                "movi	v21.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #384]")
+                "movi	v22.4s, #0x0\n"
+                "movi	v23.4s, #0x0\n"
+                "movi	v24.4s, #0x0\n"
+                "movi	v25.4s, #0x0\n"
+                "movi	v26.4s, #0x0\n"
+                "movi	v27.4s, #0x0\n"
+                "movi	v28.4s, #0x0\n"
+                "movi	v29.4s, #0x0\n"
+                "movi	v30.4s, #0x0\n"
+                "movi	v31.4s, #0x0\n"
+
+                // Skip loop if we are doing zero iterations of it.
+                "cbz	%w[k], 4f\n"
+
+                // Loop proper
+                "1:\n"
+                "fmla 	v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "fmla  	v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "ldr	%q[b2], [%[b_ptr], #32]\n"
+                "fmla	v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "add	%[b_ptr], %[b_ptr], %[row_jump]\n"
+                "fmla	v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "ldr	%q[a0a], [%[a_ptr], #32]\n"
+                "fmla 	v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla	v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "ldr	%q[a1a], [%[a_ptr], #48]\n"
+                "fmla	v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "fmla	v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "ldr	%q[b0], [%[b_ptr], #48]\n"
+
+                "fmla	v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "fmla	v17.4s, %[b1].4s, %[a0].s[1]\n"
+                ASM_PREFETCH("[%[a_ptr], #320]")
+                "fmla	v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "fmla	v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "ldr	%q[b1], [%[b_ptr], #64]\n"
+
+                "fmla	v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0].s[1]\n"
+                ASM_PREFETCH("[%[b_ptr], #448]")
+                "fmla	v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "fmla	v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "fmla	v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "fmla	v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "fmla	v31.4s, %[b2].4s, %[a1].s[3]\n"
+                "ldr	%q[b2], [%[b_ptr], #80]\n"
+
+                "fmla 	v8.4s , %[b0].4s, %[a0a].s[0]\n"
+                "fmla	v9.4s , %[b0].4s, %[a0a].s[1]\n"
+                "ldr	%q[a0], [%[a_ptr], #64]\n"
+                "fmla	v10.4s, %[b0].4s, %[a0a].s[2]\n"
+                "add	%[b_ptr], %[b_ptr], %[row_jump]\n"
+                "fmla	v11.4s, %[b0].4s, %[a0a].s[3]\n"
+                "fmla 	v12.4s, %[b0].4s, %[a1a].s[0]\n"
+                "ldr	%q[a1], [%[a_ptr], #80]\n"
+                "fmla   v13.4s, %[b0].4s, %[a1a].s[1]\n"
+                "fmla	v14.4s, %[b0].4s, %[a1a].s[2]\n"
+                "fmla	v15.4s, %[b0].4s, %[a1a].s[3]\n"
+                "ldr	%q[b0], [%[b_ptr], #96]\n"
+
+                "fmla	v16.4s, %[b1].4s, %[a0a].s[0]\n"
+                "fmla	v17.4s, %[b1].4s, %[a0a].s[1]\n"
+                ASM_PREFETCH("[%[b_ptr], #512]")
+                "fmla	v18.4s, %[b1].4s, %[a0a].s[2]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0a].s[3]\n"
+                "fmla	v20.4s, %[b1].4s, %[a1a].s[0]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1a].s[1]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1a].s[2]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1a].s[3]\n"
+                "ldr	%q[b1], [%[b_ptr], #112]\n"
+
+                "fmla	v24.4s, %[b2].4s, %[a0a].s[0]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0a].s[1]\n"
+                "add	%[a_ptr], %[a_ptr], #64\n"
+                "fmla	v26.4s, %[b2].4s, %[a0a].s[2]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0a].s[3]\n"
+                "add	%[b_ptr], %[b_ptr], #96\n"
+                "fmla	v28.4s, %[b2].4s, %[a1a].s[0]\n"
+                "fmla	v29.4s, %[b2].4s, %[a1a].s[1]\n"
+                "subs	%w[k], %w[k], #1\n"
+                "fmla	v30.4s, %[b2].4s, %[a1a].s[2]\n"
+                "fmla	v31.4s, %[b2].4s, %[a1a].s[3]\n"
+                "bne	1b\n"
+
+                // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
+                "4:\n"
+
+                // Branch to alternative tail for odd K
+                "cbnz	%[oddk], 2f\n"
+
+                // Detached final iteration (even K)
+                "fmla 	v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "fmla   v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "ldr	%q[b2], [%[b_ptr], #32]\n"
+                "fmla	v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "add	%[b_ptr], %[b_ptr], %[row_jump]\n"
+                "fmla	v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "ldr	%q[a0a], [%[a_ptr], #32]\n"
+                "fmla 	v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla   v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "ldr	%q[a1a], [%[a_ptr], #48]\n"
+                "fmla	v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "fmla	v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "ldr	%q[b0], [%[b_ptr], #48]\n"
+
+                "fmla	v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "fmla	v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "fmla	v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "fmla	v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "ldr	%q[b1], [%[b_ptr], #64]\n"
+
+                "fmla	v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0].s[1]\n"
+                "add	%[a_ptr], %[a_ptr], #64\n"
+                "fmla	v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "fmla	v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "fmla	v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "fmla	v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "fmla	v31.4s, %[b2].4s, %[a1].s[3]\n"
+                "ldr	%q[b2], [%[b_ptr], #80]\n"
+
+                "fmla 	v8.4s , %[b0].4s, %[a0a].s[0]\n"
+                "add	%[b_ptr], %[b_ptr], %[block_jump]\n"
+                "fmla	v16.4s, %[b1].4s, %[a0a].s[0]\n"
+                "add	%[b_ptr], %[b_ptr], #96\n"
+                "fmla   v9.4s , %[b0].4s, %[a0a].s[1]\n"
+                "add	%[b_ptr], %[b_ptr], %[row_jump]\n"
+                "str	q8, [%[c_ptr], #0]\n"
+                "fmla	v17.4s, %[b1].4s, %[a0a].s[1]\n"
+                "str	q16, [%[c_ptr], #16]\n"
+                "fmla	v24.4s, %[b2].4s, %[a0a].s[0]\n"
+                "str	q24, [%[c_ptr], #32]\n"
+
+                "fmla	v25.4s, %[b2].4s, %[a0a].s[1]\n"
+                "str	q9, [%[c_ptr], #48]\n"
+                "fmla	v10.4s, %[b0].4s, %[a0a].s[2]\n"
+                "str	q17, [%[c_ptr], #64]\n"
+                "fmla	v18.4s, %[b1].4s, %[a0a].s[2]\n"
+                "str	q25, [%[c_ptr], #80]\n"
+                "fmla	v26.4s, %[b2].4s, %[a0a].s[2]\n"
+                "str	q10, [%[c_ptr], #96]\n"
+
+                "fmla	v11.4s, %[b0].4s, %[a0a].s[3]\n"
+                "str	q18, [%[c_ptr], #112]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0a].s[3]\n"
+                "str	q26, [%[c_ptr], #128]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0a].s[3]\n"
+                "str	q11, [%[c_ptr], #144]\n"
+
+                "fmla 	v12.4s, %[b0].4s, %[a1a].s[0]\n"
+                "str	q19, [%[c_ptr], #160]\n"
+                "fmla	v20.4s, %[b1].4s, %[a1a].s[0]\n"
+                "str	q27, [%[c_ptr], #176]\n"
+                "fmla	v28.4s, %[b2].4s, %[a1a].s[0]\n"
+                "str	q12, [%[c_ptr], #192]\n"
+
+                "fmla   v13.4s, %[b0].4s, %[a1a].s[1]\n"
+                "str	q20, [%[c_ptr], #208]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1a].s[1]\n"
+                "str	q28, [%[c_ptr], #224]\n"
+                "fmla	v29.4s, %[b2].4s, %[a1a].s[1]\n"
+                "str	q13, [%[c_ptr], #240]\n"
+
+                "fmla	v14.4s, %[b0].4s, %[a1a].s[2]\n"
+                "str	q21, [%[c_ptr], #256]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1a].s[2]\n"
+                "str	q29, [%[c_ptr], #272]\n"
+                "fmla	v30.4s, %[b2].4s, %[a1a].s[2]\n"
+                "str	q14, [%[c_ptr], #288]\n"
+
+                "fmla	v15.4s, %[b0].4s, %[a1a].s[3]\n"
+                "str	q22, [%[c_ptr], #304]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1a].s[3]\n"
+                "str	q30, [%[c_ptr], #320]\n"
+                "fmla	v31.4s, %[b2].4s, %[a1a].s[3]\n"
+                "str	q15, [%[c_ptr], #336]\n"
+
+                "b	3f\n"
+
+                // Detached final iteration (odd K)
+                "2:\n"
+                "fmla 	v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "ldr	%q[b2], [%[b_ptr], #32]\n"
+                "fmla	v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "add	%[b_ptr], %[b_ptr], %[row_jump]\n"
+                "fmla   v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "str	q8, [%[c_ptr], #0]\n"
+                "fmla	v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "str	q16, [%[c_ptr], #16]\n"
+                "fmla	v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "add	%[b_ptr], %[b_ptr], #48\n"
+                "add	%[a_ptr], %[a_ptr], #32\n"
+                "str	q24, [%[c_ptr], #32]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0].s[1]\n"
+                "str	q9, [%[c_ptr], #48]\n"
+
+                "fmla	v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "str	q17, [%[c_ptr], #64]\n"
+                "fmla	v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "str	q25, [%[c_ptr], #80]\n"
+                "fmla	v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "str	q10, [%[c_ptr], #96]\n"
+
+                "fmla	v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "str	q18, [%[c_ptr], #112]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "str	q26, [%[c_ptr], #128]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "str	q11, [%[c_ptr], #144]\n"
+
+                "fmla 	v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "str	q19, [%[c_ptr], #160]\n"
+                "fmla	v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "str	q27, [%[c_ptr], #176]\n"
+                "fmla	v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "str	q12, [%[c_ptr], #192]\n"
+
+                "fmla   v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "str	q20, [%[c_ptr], #208]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "str	q28, [%[c_ptr], #224]\n"
+                "fmla	v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "str	q13, [%[c_ptr], #240]\n"
+
+                "fmla	v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "str	q21, [%[c_ptr], #256]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "str	q29, [%[c_ptr], #272]\n"
+                "fmla	v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "str	q14, [%[c_ptr], #288]\n"
+
+                "fmla	v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "str	q22, [%[c_ptr], #304]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "str	q30, [%[c_ptr], #320]\n"
+                "fmla	v31.4s, %[b2].4s, %[a1].s[3]\n"
+                "str	q15, [%[c_ptr], #336]\n"
+
+                // Common tail
+                "3:\n"
+                "str	q23, [%[c_ptr], #352]\n"
+                "str	q31, [%[c_ptr], #368]\n"
+                "add	%[c_ptr], %[c_ptr], #384\n"
+            :
+              [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+              [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
+              [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
+            : [oddk] "r" (oddk), [row_jump] "r" (row_jump), [block_jump] "r" (block_jump)
+            : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+              "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+            );
+        }
+    }
+}
+
+inline void a64_sgemm_asimd_12x8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+    a64_sgemm_asimd_12x8_jumps(Apanel, Bpanel, Cpanel, ablocks, bblocks, K, 0, 0);
+}
diff --git a/arm_compute/core/NEON/kernels/assembly/mergeresults.hpp b/arm_compute/core/NEON/kernels/assembly/mergeresults.hpp
new file mode 100644
index 0000000..6731480
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/mergeresults.hpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+template<unsigned int width, unsigned int height, typename Tin, typename Tout>
+void MergeResults(Tout *out, const Tin *in, int ldc, int y0, int ymax, int x0, int xmax, const Tout alpha, const Tout beta) {
+    int full_y_blocks = (ymax - y0) / height;
+    int y_remainder = (ymax - y0) % height;
+    int y_blocks = full_y_blocks + (y_remainder ? 1 : 0);
+
+    int full_x_blocks = (xmax - x0) / width;
+    int x_remainder = (xmax - x0) % width;
+    int x_blocks = full_x_blocks + (x_remainder ? 1 : 0);
+
+    for (int y_block = 0; y_block < y_blocks; y_block++) {
+        int ybase = y0 + (y_block * height);
+
+        int fill_rows = (y_block < full_y_blocks) ? height : y_remainder;
+
+        for (int x_block = 0; x_block < x_blocks; x_block++) {
+            int xbase = x0 + (x_block * width);
+
+            int fill_cols = (x_block < full_x_blocks) ? width : x_remainder;
+
+            for (int row=0; row < fill_rows; row++) {
+                for (int col=0; col < fill_cols; col++) {
+                    Tout &p = out[(ybase + row) * ldc + xbase + col];
+
+                    p = (p * beta) + (alpha * in[row * width + col]);
+                }
+            }
+
+            in += (width * height);
+        }
+    }
+}
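+
+// In GEMM terms the merge computes, per element of the output block:
+//   out[y][x] = beta * out[y][x] + alpha * acc[y][x]
+// where "acc" is the interleaved accumulator panel written by the kernel.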
+
+#include "merges/list.hpp"
diff --git a/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp
new file mode 100644
index 0000000..f2c5fd8
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include "../asmlib.hpp"
+
+template<>
+inline void MergeResults<12, 8>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta) {
+    const float *inptr = in;
+    prefetch_6x(inptr);
+    prefetch_6x(inptr + 96);
+
+    float32x4_t av = vdupq_n_f32(alpha);
+    float32x4_t bv = vdupq_n_f32(beta);
+
+    for (int y=y0; y<ymax; y+=8) {
+        float *outptr0 = out + (y * ldout) + x0;
+        float *outptr1 = outptr0 + ldout;
+        float *outptr2 = outptr1 + ldout;
+        float *outptr3 = outptr2 + ldout;
+        float *outptr4 = outptr3 + ldout;
+        float *outptr5 = outptr4 + ldout;
+        float *outptr6 = outptr5 + ldout;
+        float *outptr7 = outptr6 + ldout;
+
+        prefetch_2x(outptr0);
+        prefetch_2x(outptr1);
+        prefetch_2x(outptr2);
+        prefetch_2x(outptr3);
+        prefetch_2x(outptr4);
+        prefetch_2x(outptr5);
+        prefetch_2x(outptr6);
+        prefetch_2x(outptr7);
+
+        for (int i=x0; i<xmax; i+=12) {
+            float dummyres[12];
+
+            /* Rows beyond ymax (i.e. when the Y range is not a multiple of 8)
+             * must not be written back.  Point their output pointers at a
+             * dummy buffer which is simply discarded - note the intentional
+             * fall-through in the switch below.  */
+            if ((y+7) >= ymax) {
+                switch ((y + 7) - ymax) {
+                    case 6:
+                        outptr1 = dummyres;
+                    case 5:
+                        outptr2 = dummyres;
+                    case 4:
+                        outptr3 = dummyres;
+                    case 3:
+                        outptr4 = dummyres;
+                    case 2:
+                        outptr5 = dummyres;
+                    case 1:
+                        outptr6 = dummyres;
+                    case 0:
+                        outptr7 = dummyres;
+                    default:
+                        break;
+                }
+            }
+
+            /* For ragged X, manually copy over the valid results. */
+            if ((i+11) >= xmax) {
+                for (int xi=0; xi<12; xi++) {
+                    if ((i+xi) < xmax) {
+                        *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+                        outptr0++;
+                        *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+                        outptr1++;
+                        *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+                        outptr2++;
+                        *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+                        outptr3++;
+                        *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
+                        outptr4++;
+                        *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
+                        outptr5++;
+                        *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta);
+                        outptr6++;
+                        *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta);
+                        outptr7++;
+                    }
+                }
+                inptr += 96;
+            } else {
+                /* Optimized routine to merge an entire 12x8 block:
+                 * out = (alpha * result) + (beta * out) for every element. */
+                __asm __volatile (
+                    // Rows 0-1
+                    "LDP	q16, q17, [%[outptr0]]\n"
+                    "FMUL	v16.4s, v16.4s, %[bv].4s\n"
+                    "LDR	q18, [%[outptr0], #32]\n"
+                    "FMUL	v17.4s, v17.4s, %[bv].4s\n"
+                    "LDP	q19, q20, [%[outptr1]]\n"
+                    "FMUL	v18.4s, v18.4s, %[bv].4s\n"
+                    "LDR	q21, [%[outptr1], #32]\n"
+                    ASM_PREFETCH("[%[inptr], #768]")
+                    "FMUL	v19.4s, v19.4s, %[bv].4s\n"
+                    "LDP	q0,  q1,  [%[inptr]]\n"
+                    "FMUL	v20.4s, v20.4s, %[bv].4s\n"
+                    "LDP	q2,  q3,  [%[inptr], #32]\n"
+                    "FMUL	v21.4s, v21.4s, %[bv].4s\n"
+                    "LDP	q4,  q5,  [%[inptr], #64]\n"
+                    "FMLA	v16.4s, v0.4s, %[av].4s\n"
+                    ASM_PREFETCH("[%[inptr], #832]")
+                    "FMLA	v17.4s, v1.4s, %[av].4s\n"
+                    "STP	q16, q17, [%[outptr0]], #32\n"
+                    "FMLA	v18.4s, v2.4s, %[av].4s\n"
+                    "STR	q18, [%[outptr0]], #16\n"
+                    "FMLA	v19.4s, v3.4s, %[av].4s\n"
+                    ASM_PREFETCH("[%[inptr], #896]")
+                    "FMLA	v20.4s, v4.4s, %[av].4s\n"
+                    "STP	q19, q20, [%[outptr1]], #32\n"
+                    "FMLA	v21.4s, v5.4s, %[av].4s\n"
+                    "STR	q21, [%[outptr1]], #16\n"
+
+                    // Rows 2-3
+                    "LDP	q16, q17, [%[outptr2]]\n"
+                    "FMUL	v16.4s, v16.4s, %[bv].4s\n"
+                    "LDR	q18, [%[outptr2], #32]\n"
+                    "FMUL	v17.4s, v17.4s, %[bv].4s\n"
+                    "LDP	q19, q20, [%[outptr3]]\n"
+                    "FMUL	v18.4s, v18.4s, %[bv].4s\n"
+                    "LDR	q21, [%[outptr3], #32]\n"
+                    ASM_PREFETCH("[%[inptr], #960]")
+                    "FMUL	v19.4s, v19.4s, %[bv].4s\n"
+                    "LDP	q0,  q1,  [%[inptr], #96]\n"
+                    "FMUL	v20.4s, v20.4s, %[bv].4s\n"
+                    "LDP	q2,  q3,  [%[inptr], #128]\n"
+                    "FMUL	v21.4s, v21.4s, %[bv].4s\n"
+                    "LDP	q4,  q5,  [%[inptr], #160]\n"
+                    "FMLA	v16.4s, v0.4s, %[av].4s\n"
+                    ASM_PREFETCH("[%[inptr], #1024]")
+                    "FMLA	v17.4s, v1.4s, %[av].4s\n"
+                    "STP	q16, q17, [%[outptr2]], #32\n"
+                    "FMLA	v18.4s, v2.4s, %[av].4s\n"
+                    "STR	q18, [%[outptr2]], #16\n"
+                    "FMLA	v19.4s, v3.4s, %[av].4s\n"
+                    ASM_PREFETCH("[%[inptr], #1088]")
+                    "FMLA	v20.4s, v4.4s, %[av].4s\n"
+                    "STP	q19, q20, [%[outptr3]], #32\n"
+                    "FMLA	v21.4s, v5.4s, %[av].4s\n"
+                    "STR	q21, [%[outptr3]], #16\n"
+
+                    // Rows 4-5
+                    ASM_PREFETCH("[%[outptr0], #80]")
+                    "LDP	q16, q17, [%[outptr4]]\n"
+                    "FMUL	v16.4s, v16.4s, %[bv].4s\n"
+                    "LDR	q18, [%[outptr4], #32]\n"
+                    "FMUL	v17.4s, v17.4s, %[bv].4s\n"
+                    "LDP	q19, q20, [%[outptr5]]\n"
+                    "FMUL	v18.4s, v18.4s, %[bv].4s\n"
+                    "LDR	q21, [%[outptr5], #32]\n"
+                    ASM_PREFETCH("[%[outptr1], #80]")
+                    "FMUL	v19.4s, v19.4s, %[bv].4s\n"
+                    "LDP	q0,  q1,  [%[inptr], #192]\n"
+                    "FMUL	v20.4s, v20.4s, %[bv].4s\n"
+                    "LDP	q2,  q3,  [%[inptr], #224]\n"
+                    "FMUL	v21.4s, v21.4s, %[bv].4s\n"
+                    "LDP	q4,  q5,  [%[inptr], #256]\n"
+                    "FMLA	v16.4s, v0.4s, %[av].4s\n"
+                    ASM_PREFETCH("[%[outptr2], #80]")
+                    "FMLA	v17.4s, v1.4s, %[av].4s\n"
+                    "STP	q16, q17, [%[outptr4]], #32\n"
+                    "FMLA	v18.4s, v2.4s, %[av].4s\n"
+                    "STR	q18, [%[outptr4]], #16\n"
+                    "FMLA	v19.4s, v3.4s, %[av].4s\n"
+                    ASM_PREFETCH("[%[outptr3], #80]")
+                    "FMLA	v20.4s, v4.4s, %[av].4s\n"
+                    "STP	q19, q20, [%[outptr5]], #32\n"
+                    "FMLA	v21.4s, v5.4s, %[av].4s\n"
+                    "STR	q21, [%[outptr5]], #16\n"
+
+                    // Rows 6-7
+                    ASM_PREFETCH("[%[outptr4], #80]")
+                    "LDP	q16, q17, [%[outptr6]]\n"
+                    "FMUL	v16.4s, v16.4s, %[bv].4s\n"
+                    "LDR	q18, [%[outptr6], #32]\n"
+                    "FMUL	v17.4s, v17.4s, %[bv].4s\n"
+                    "LDP	q19, q20, [%[outptr7]]\n"
+                    "FMUL	v18.4s, v18.4s, %[bv].4s\n"
+                    "LDR	q21, [%[outptr7], #32]\n"
+                    ASM_PREFETCH("[%[outptr5], #80]")
+                    "FMUL	v19.4s, v19.4s, %[bv].4s\n"
+                    "LDP	q0,  q1,  [%[inptr], #288]\n"
+                    "FMUL	v20.4s, v20.4s, %[bv].4s\n"
+                    "LDP	q2,  q3,  [%[inptr], #320]\n"
+                    "FMUL	v21.4s, v21.4s, %[bv].4s\n"
+                    "LDP	q4,  q5,  [%[inptr], #352]\n"
+                    "FMLA	v16.4s, v0.4s, %[av].4s\n"
+                    ASM_PREFETCH("[%[outptr6], #128]")
+                    "FMLA	v17.4s, v1.4s, %[av].4s\n"
+                    "STP	q16, q17, [%[outptr6]], #32\n"
+                    "FMLA	v18.4s, v2.4s, %[av].4s\n"
+                    "STR	q18, [%[outptr6]], #16\n"
+                    "FMLA	v19.4s, v3.4s, %[av].4s\n"
+                    ASM_PREFETCH("[%[outptr7], #128]")
+                    "FMLA	v20.4s, v4.4s, %[av].4s\n"
+                    "STP	q19, q20, [%[outptr7]], #32\n"
+                    "FMLA	v21.4s, v5.4s, %[av].4s\n"
+                    "STR	q21, [%[outptr7]], #16\n"
+                    "ADD	%[inptr], %[inptr], #384\n"
+                : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3),
+                  [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                  [inptr] "+r" (inptr)
+                : [av] "w" (av), [bv] "w" (bv)
+                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q16", "q17", "q18", "q19", "q20", "q21"
+                );
+            }
+        }
+    }
+}
+
+#endif // __aarch64__
diff --git a/arm_compute/core/NEON/kernels/assembly/merges/list.hpp b/arm_compute/core/NEON/kernels/assembly/merges/list.hpp
new file mode 100644
index 0000000..4f23333
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/merges/list.hpp
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "a64_merge_float_12x8.hpp"
diff --git a/arm_compute/core/NEON/kernels/assembly/profiler.hpp b/arm_compute/core/NEON/kernels/assembly/profiler.hpp
new file mode 100644
index 0000000..d2f8ba9
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/profiler.hpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef CYCLE_PROFILING
+
+#include "../perf.h"
+
+class profiler {
+private:
+    static const int maxevents = 10000;
+    unsigned long times[maxevents];
+    int events[maxevents];
+    int currentevent;
+    int countfd;
+
+public:
+    profiler() {
+        currentevent=0;
+        countfd=open_cycle_counter();
+    }
+
+    ~profiler() {
+        close(countfd);
+        int tots[5];
+        unsigned long counts[5];
+        const char * descs[] = { "Prepare A", "Prepare B", "Kernel", "Merge" };
+
+        for (int i=1; i<5; i++) {
+            tots[i] = 0;
+            counts[i] = 0;
+        }
+
+        printf("Profiled events:\n");
+        for (int i=0; i<currentevent; i++) {
+            printf("%10s: %ld\n", descs[events[i]-1], times[i]);
+            tots[events[i]]++;
+            counts[events[i]] += times[i];
+        }
+
+        printf("%20s  %9s %9s %9s\n", "", "Events", "Total", "Average");
+        for (int i=1; i<5; i++) {
+            printf("%20s: %9d %9ld %9ld\n",descs[i-1],tots[i],counts[i],counts[i]/tots[i]);
+        }
+    }
+
+    template <typename T>
+    void operator() (int i, T func) {
+        if (currentevent==maxevents) {
+            func();
+        } else {
+            start_counter(countfd);
+            func();
+            long long cycs = stop_counter(countfd);
+            events[currentevent] = i;
+            times[currentevent++] = cycs;
+        }
+    }
+};
+
+#else
+
+class profiler {
+public:
+    template <typename T>
+    void operator() (int i, T func) {
+        func();
+    }
+};
+
+#endif
+
+#define PROFILE_PREPA 1
+#define PROFILE_PREPB 2
+#define PROFILE_KERNEL 3
+#define PROFILE_MERGE 4
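+
+// Illustrative usage (a sketch only; the 'prof' instance and the lambda body
+// are assumptions, not part of this header): each GEMM stage is wrapped in a
+// call such as
+//
+//     profiler prof;
+//     prof(PROFILE_KERNEL, [&]() { /* run the inner kernel */ });
+//
+// With CYCLE_PROFILING defined the call is bracketed by start_counter() and
+// stop_counter() and the cycle count is recorded; otherwise the functor is
+// simply invoked with no overhead.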
diff --git a/arm_compute/core/NEON/kernels/assembly/transform.hpp b/arm_compute/core/NEON/kernels/assembly/transform.hpp
new file mode 100644
index 0000000..717506f
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/transform.hpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+/*
+ * Generic transform.
+ *
+ * Assuming the untransposed case, this works by first reading <BlockBy>
+ * consecutive values from the first input row.  The same number of values
+ * is then read from each of the next <IntBy-1> rows, after which we return
+ * to the first input row and repeat.
+ *
+ * The code must also cope with the requested work in either dimension not
+ * being a multiple of the block sizes; out-of-range elements are zero-filled.
+ */
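+/*
+ * Worked example (illustrative only): with IntBy=2, BlockBy=1, untransposed,
+ * and a 3x3 input
+ *
+ *     a b c
+ *     d e f
+ *     g h i
+ *
+ * the output sequence is  a d  b e  c f  g 0  h 0  i 0  -- rows are taken
+ * two at a time, single elements are interleaved, and the short final row
+ * block is padded with zeroes.
+ */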
+template <unsigned int IntBy, unsigned int BlockBy, bool Transposed, size_t TOutSize, size_t TInSize>
+struct TransformImpl {
+    template <typename TOut, typename TIn>
+    static void Transform(TOut* out, const TIn* const in, const int stride,
+                          const int y0, const int ymax, const int x0, const int xmax) {
+        const int n_whole_y_blocks = (ymax - y0) / IntBy;
+        const int y_remainders = (ymax - y0) % IntBy;
+        const int n_y_blocks = n_whole_y_blocks + (y_remainders ? 1 : 0);
+
+        const int n_whole_x_blocks = (xmax - x0) / BlockBy;
+        const int x_remainders = (xmax - x0) % BlockBy;
+        const int n_x_blocks = n_whole_x_blocks + (x_remainders ? 1 : 0);
+
+        // "Y" loop: advance down the rows of the source IntBy rows at a time.
+        // Set up fill_rows to show the number rows to copy from, and blank_rows
+        // for the number of blank rows to add.
+        for (int y_block=0 ; y_block < n_y_blocks; y_block++) {
+            int fill_rows = (y_block < n_whole_y_blocks) ? IntBy : y_remainders;
+            int blank_rows = IntBy - fill_rows;
+
+            int y_base = y0 + (y_block * IntBy);
+
+            // So now advance along this block of rows, BlockBy columns at a time.
+            for (int x_block=0 ; x_block < n_x_blocks; x_block++) {
+                int fill_cols = (x_block < n_whole_x_blocks) ? BlockBy : x_remainders;
+                int blank_cols = BlockBy - fill_cols;
+
+                int x_base = x0 + (x_block * BlockBy);
+
+                for (int row = 0; row < fill_rows; row++) {
+                    for (int col = 0; col < fill_cols; col++) {
+                        // In-range copy.  If it's transposed, we reverse the sense of rows and columns here.
+                        if (Transposed) {
+                            *out++ = static_cast<TOut>(in[(x_base + col) * stride + y_base + row]);
+                        } else {
+                            *out++ = static_cast<TOut>(in[(y_base + row) * stride + x_base + col]);
+                        }
+                    }
+                    // "col" tail - row is in range but column is out of range.
+                    for (int col=0; col < blank_cols; col++) {
+                        *out++ = static_cast<TOut>(0);
+                    }
+                }
+                // "row" tail - row is out of range so fill with zeros always.
+                for (int row = 0; row < blank_rows; row++) {
+                    for (int col=0; col < (fill_cols + blank_cols); col++) {
+                        *out++ = static_cast<TOut>(0);
+                    }
+                }
+            }
+        }
+    }
+
+    template <typename T>
+    static inline void Transform(T* out, const T* const in, const int stride,
+                                 const int k0, const int kmax, const int x0, const int xmax) {
+        Transform<T, T>(out, in, stride, k0, kmax, x0, xmax);
+    }
+};
+
+/*****************************************************************************/
+template <unsigned int IntBy, unsigned int BlockBy, bool Transposed, typename TOut, typename TIn>
+void Transform(
+  TOut* out, const TIn* const in, const int stride,
+  const int k0, const int kmax, const int x0, const int xmax
+) {
+  // Redirect to a specialised implementation predicated on argument size.
+  TransformImpl<IntBy, BlockBy, Transposed, sizeof(TOut), sizeof(TIn)>::Transform(
+    out, in, stride, k0, kmax, x0, xmax
+  );
+}
+/*****************************************************************************/
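+
+// For example (a sketch only; 'outbuf', 'A', 'lda', 'M' and 'K' are
+// illustrative names, and float data is assumed), an A-panel interleave
+// might be requested as
+//
+//     Transform<8, 1, false>(outbuf, A, lda, 0, M, 0, K);
+//
+// which resolves to TransformImpl<8, 1, false, 4, 4>::Transform and so picks
+// up the NEON-specialised 8-way interleave pulled in below.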
+
+#include "transforms/list.hpp"
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp
new file mode 100644
index 0000000..6317424
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include "../asmlib.hpp"
+
+#include <arm_neon.h>
+
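+// 8-way 32-bit interleave: rows are taken eight at a time and single elements
+// are interleaved, so one output block is
+//
+//     A0 B0 C0 D0 E0 F0 G0 H0  A1 B1 C1 D1 E1 F1 G1 H1  ...
+//
+// where A..H are eight consecutive source rows.  The scalar tail loop at the
+// end of this function is the plain C++ equivalent of the NEON path below.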
+template<>
+template<typename T>
+void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
+    uint32_t *outptr = (uint32_t *)out;
+    const uint32_t *inptr = (uint32_t *)in;
+
+    uint32_t zerobuff[8] = { 0 }; /* Zero-filled source for rows beyond ymax. */
+
+    for (int y=y0; y<ymax; y+=8) {
+        const uint32_t *inptr0 = inptr + y * ldin + k0;
+        const uint32_t *inptr1 = inptr0 + ldin;
+        const uint32_t *inptr2 = inptr1 + ldin;
+        const uint32_t *inptr3 = inptr2 + ldin;
+        const uint32_t *inptr4 = inptr3 + ldin;
+        const uint32_t *inptr5 = inptr4 + ldin;
+        const uint32_t *inptr6 = inptr5 + ldin;
+        const uint32_t *inptr7 = inptr6 + ldin;
+
+        prefetch_2x(inptr0);
+        prefetch_2x(inptr1);
+        prefetch_2x(inptr2);
+        prefetch_2x(inptr3);
+        prefetch_2x(inptr4);
+        prefetch_2x(inptr5);
+        prefetch_2x(inptr6);
+        prefetch_2x(inptr7);
+
+        int x=(kmax-k0);
+        for (;x>7;x-=8) {
+            /* Cope with ragged cases by copying from a buffer of zeroes instead */
+            if ((y + 7) >= ymax) {
+                switch ((y + 7) - ymax) {
+                    /* Everything falls through in here */
+                    case 6:
+                        inptr1 = zerobuff;
+                    case 5:
+                        inptr2 = zerobuff;
+                    case 4:
+                        inptr3 = zerobuff;
+                    case 3:
+                        inptr4 = zerobuff;
+                    case 2:
+                        inptr5 = zerobuff;
+                    case 1:
+                        inptr6 = zerobuff;
+                    case 0:
+                        inptr7 = zerobuff;
+                    default:
+                        break;
+                }
+            }
+
+            __asm __volatile (
+                // Load up 8 elements (2 vectors) from each of 8 sources.
+                "LDP        q0, q1, [%[inptr0]], #32\n" // q0=A0A1A2A3
+                "LDP        q2, q3, [%[inptr1]], #32\n" // q2=B0B1B2B3
+                "LDP        q4, q5, [%[inptr2]], #32\n" // q4=C0C1C2C3
+                "ZIP1       v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1
+                ASM_PREFETCH("[%[inptr0], #128]")
+                "LDP        q6, q7, [%[inptr3]], #32\n" // q6=D0D1D2D3
+                "ZIP1       v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1
+                "LDP        q8, q9, [%[inptr4]], #32\n"
+                "LDP        q10, q11, [%[inptr5]], #32\n"
+                ASM_PREFETCH("[%[inptr1], #128]")
+                "LDP        q12, q13, [%[inptr6]], #32\n"
+                "ZIP1       v18.4s, v8.4s, v12.4s\n"
+                "LDP        q14, q15, [%[inptr7]], #32\n"
+                "ZIP1       v19.4s, v10.4s, v14.4s\n"
+
+                ASM_PREFETCH("[%[inptr2], #128]")
+                "ZIP1       v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0
+                "ZIP1       v21.4s, v18.4s, v19.4s\n"
+                "ZIP2       v22.4s, v16.4s, v17.4s\n"
+                "ZIP2       v23.4s, v18.4s, v19.4s\n"
+                ASM_PREFETCH("[%[inptr3], #128]")
+
+                "ZIP2       v16.4s, v0.4s, v4.4s\n"
+                "ZIP2       v17.4s, v2.4s, v6.4s\n"
+                "STP        q20, q21, [%[outptr]], #32\n" // Write back the first element of each source
+
+                "ZIP2       v18.4s, v8.4s, v12.4s\n"
+                ASM_PREFETCH("[%[inptr4], #128]")
+                "ZIP2       v19.4s, v10.4s, v14.4s\n"
+                "STP        q22, q23, [%[outptr]], #32\n" // Write back the second element of each source
+
+                "ZIP1       v20.4s, v16.4s, v17.4s\n"
+                "ZIP1       v21.4s, v18.4s, v19.4s\n"
+                ASM_PREFETCH("[%[inptr5], #128]")
+                "ZIP2       v22.4s, v16.4s, v17.4s\n"
+                "ZIP2       v23.4s, v18.4s, v19.4s\n"
+
+                "ZIP1       v16.4s, v1.4s, v5.4s\n"
+                "ZIP1       v17.4s, v3.4s, v7.4s\n"
+                ASM_PREFETCH("[%[inptr6], #128]")
+                "STP        q20, q21, [%[outptr]], #32\n" // Third element
+
+                "ZIP1       v18.4s, v9.4s, v13.4s\n"
+                "ZIP1       v19.4s, v11.4s, v15.4s\n"
+                "STP        q22, q23, [%[outptr]], #32\n" // Fourth element
+                ASM_PREFETCH("[%[inptr7], #128]")
+
+                "ZIP1       v20.4s, v16.4s, v17.4s\n"
+                "ZIP1       v21.4s, v18.4s, v19.4s\n"
+                "ZIP2       v22.4s, v16.4s, v17.4s\n"
+                "ZIP2       v23.4s, v18.4s, v19.4s\n"
+
+                "ZIP2       v16.4s, v1.4s, v5.4s\n"
+                "ZIP2       v17.4s, v3.4s, v7.4s\n"
+                "STP        q20, q21, [%[outptr]], #32\n" // Fifth element
+
+                "ZIP2       v18.4s, v9.4s, v13.4s\n"
+                "ZIP2       v19.4s, v11.4s, v15.4s\n"
+                "STP        q22, q23, [%[outptr]], #32\n" // Sixth element
+
+                "ZIP1       v20.4s, v16.4s, v17.4s\n"
+                "ZIP1       v21.4s, v18.4s, v19.4s\n"
+                "STP        q20, q21, [%[outptr]], #32\n" // Seventh element
+
+                "ZIP2       v22.4s, v16.4s, v17.4s\n"
+                "ZIP2       v23.4s, v18.4s, v19.4s\n"
+                "STP        q22, q23, [%[outptr]], #32\n" // Eighth element
+                : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
+                  [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
+                :
+                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
+                  "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
+            );
+        }
+
+        for (;x>0;x--) {
+            *outptr++ = *inptr0++;
+            *outptr++ = *inptr1++;
+            *outptr++ = *inptr2++;
+            *outptr++ = *inptr3++;
+            *outptr++ = *inptr4++;
+            *outptr++ = *inptr5++;
+            *outptr++ = *inptr6++;
+            *outptr++ = *inptr7++;
+        }
+    }
+}
+
+#endif  // __aarch64__
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp
new file mode 100644
index 0000000..3cf6b41
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+//#include "a32_interleave_6way_32bit.hpp"
+//#include "a32_transpose_interleave_8way_32bit.hpp"
+//#include "a64_interleave_8way_16bit.hpp"
+#include "a64_interleave_8way_32bit.hpp"
+//#include "a64_interleave_8way_half_to_float.hpp"
+//#include "a64_transpose_interleave_12way_16bit.hpp"
+//#include "a64_transpose_interleave_12way_half_to_float.hpp"
+//#include "a64_transpose_interleave_24way_16bit.hpp"
+#include "transpose_interleave_common.hpp"
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/transpose_interleave_common.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/transpose_interleave_common.hpp
new file mode 100644
index 0000000..882da9c
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/transforms/transpose_interleave_common.hpp
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
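+// Generic transpose-interleave: the source is addressed with K as the row
+// index (stride 'ldin') and X as the column index.  The output is blocked by
+// groups of IntBy consecutive X values; within each block the data is laid
+// out k-major, i.e. for k = k0..kmax-1 in turn, the IntBy adjacent X elements
+// of that row.  The moveblock_1xY helpers below are the plain C++ reference
+// implementations.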
+template <unsigned int IntBy, typename TIn, typename TOut>
+struct TransposeInterleaveCommon {
+  // Override the moveblock_1xY methods to improve performance
+  static inline void moveblock_1x1(const TIn *&in0, TOut *out) {
+    for (unsigned int i = 0; i < IntBy; i++) {
+      *out++ = static_cast<TOut>(*in0++);
+    }
+  }
+
+  static inline void moveblock_1x2(const TIn *&in0, const TIn *&in1, TOut *out) {
+    for (unsigned int i = 0; i < IntBy; i++) {
+      *out++ = static_cast<TOut>(*in0++);
+    }
+    for (unsigned int i = 0; i < IntBy; i++) {
+      *out++ = static_cast<TOut>(*in1++);
+    }
+  }
+
+  static inline void moveblock_1x4(const TIn *&in0, const TIn *&in1, const TIn *&in2, const TIn *&in3, TOut *out) {
+    for (unsigned int i = 0; i < IntBy; i++) {
+      *out++ = static_cast<TOut>(*in0++);
+    }
+    for (unsigned int i = 0; i < IntBy; i++) {
+      *out++ = static_cast<TOut>(*in1++);
+    }
+    for (unsigned int i = 0; i < IntBy; i++) {
+      *out++ = static_cast<TOut>(*in2++);
+    }
+    for (unsigned int i = 0; i < IntBy; i++) {
+      *out++ = static_cast<TOut>(*in3++);
+    }
+  }
+
+  static inline void Transform(TOut *out, const TIn *in, const int stride, const int x0, const int xmax, const int k0, const int kmax) {
+    const auto ldin = stride;
+
+    TOut *outarray = out;
+    const TIn *inarray = in;
+    TOut *outptr_base = outarray;
+    const TIn *inptr_base = inarray + x0 + (k0 * ldin);
+    int ldout = (kmax - k0) * IntBy;
+
+    int k=(kmax-k0);
+    for ( ; k>3; k-=4) {
+        TOut *outptr = outptr_base;
+        const TIn *inptr = inptr_base;
+        const TIn *inptr1 = inptr + ldin;
+        const TIn *inptr2 = inptr1 + ldin;
+        const TIn *inptr3 = inptr2 + ldin;
+
+        prefetch_3x(inptr);
+        prefetch_3x(inptr1);
+        prefetch_3x(inptr2);
+        prefetch_3x(inptr3);
+
+        outptr_base += IntBy * 4;
+        inptr_base += ldin * 4;
+
+        for (int x = (xmax-x0) / IntBy; x > 0 ; x--) {
+            moveblock_1x4(inptr, inptr1, inptr2, inptr3, outptr);
+            outptr += ldout;
+        }
+    }
+
+    if (k) {
+        TOut *outptr = outptr_base;
+        const TIn *inptr = inptr_base;
+        const TIn *inptr1 = inptr + ldin;
+        const TIn *inptr2 = inptr1 + ldin;
+
+        prefetch_3x(inptr);
+        prefetch_3x(inptr1);
+        prefetch_3x(inptr2);
+
+        for (int x = (xmax-x0) / IntBy; x > 0 ; x--) {
+            switch(k) {
+                case 3:
+                    moveblock_1x2(inptr, inptr1, outptr);
+                    moveblock_1x1(inptr2, outptr + IntBy * 2);
+                    break;
+
+                case 2:
+                    moveblock_1x2(inptr, inptr1, outptr);
+                    break;
+
+                case 1:
+                    moveblock_1x1(inptr, outptr);
+                    break;
+                default:
+                    break;
+            }
+
+            outptr  += ldout;
+        }
+    }
+
+    // Cope with ragged X cases
+    const unsigned int overflow = (xmax - x0) % IntBy;
+    if (overflow) {
+        const TIn *inptr_base = inarray + (xmax - overflow) + (k0 * ldin);
+        TOut *outptr = outarray + ((xmax - x0) / IntBy) * ldout;
+
+        for (int k=(kmax-k0); k>0; k--) {
+            const TIn *inptr = inptr_base;
+            inptr_base += ldin;
+
+            for (unsigned int x=0; x < IntBy; x++) {
+                TOut val = (x < overflow) ? static_cast<TOut>(*inptr++) : static_cast<TOut>(0);
+                *outptr++ = val;
+            }
+        }
+    }
+  }
+};