COMPMID-3578: Update FP32/int8 kernel selection.

Upgrade the current 'is_preferred()' mechanism to a new framework in
which kernels instead provide an estimated cycle count figure.
Compatibility with the old mechanism is achieved via a wrapper which
replaces a "true" result with an estimate of 0 and a "false" result
with UINT64_MAX.
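
As a sketch, the compatibility wrapper (mirroring the constructor
added to gemm_implementation.hpp below) simply converts the old
boolean hint into the two extreme estimate values:

    // Old-style entry: a bool is_recommended() hint is wrapped into a
    // cycle_estimate functor returning 0 ("recommended, pick me now")
    // or UINT64_MAX ("supported, but only as a last resort").
    cycle_estimate = [is_recommended](const GemmArgs &args) -> uint64_t {
        return (is_recommended == nullptr) ? 0
                                           : (is_recommended(args) ? 0 : UINT64_MAX);
    };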

This mechanism is then used to select between 'interleaved' and
'hybrid' FP32 NEON kernels.  This uses a simple model based on
counting the MACs performed and the bytes of data transferred (for
rearrange/merge operations) and dividing by fixed per-CPU performance
figures, provided for A53, A55, A73 and a 'default' case (based on
A76).
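
A minimal sketch of the cost model (the real estimate_cycles()
implementations below also round each dimension up to the kernel's
block sizes and apply a penalty when threading is constrained); the
helper name here is illustrative only:

    // cycles ~= MACs / (MACs per cycle)
    //         + rearrange bytes / (bytes per cycle)
    //         + merge bytes / (bytes per cycle)
    uint64_t rough_estimate(uint64_t macs, uint64_t prepare_bytes,
                            uint64_t merge_bytes, const PerformanceParameters &p) {
        float cycles = static_cast<float>(macs) / p.kernel_macs_cycle
                     + static_cast<float>(prepare_bytes) / p.prepare_bytes_cycle
                     + static_cast<float>(merge_bytes) / p.merge_bytes_cycle;
        return static_cast<uint64_t>(cycles);
    }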

Separately, a new route for performing int8 GEMMs using the int16
kernel is provided.  On A53 this performs significantly better for
uint8, and slightly better for int8, than the existing int8 route.
Optimized 8-to-16 bit transforms are also included.
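
For reference, the new route is registered as an ordinary candidate
gated on A53 (this matches the gemm_int8.cpp hunk below; gemm_uint8.cpp
gains an equivalent gemm_u16_12x8 entry):

    {
        GemmMethod::GEMM_INTERLEAVED,
        "gemm_s16_12x8",
        nullptr,
        // Old-style bool hint: on A53 this becomes an estimate of 0,
        // so the s16 route is chosen immediately there.
        [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53; },
        [](const GemmArgs &args) { return new GemmInterleaved<gemm_s16_12x8, int8_t, int32_t>(args); },
    },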

Change-Id: I53b2e59eb9368793c78c2081e17d2445361bcc47
Signed-off-by: David Mansell <David.Mansell@arm.com>
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/250120
Tested-by: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Comments-Addressed: bsgcomp <bsgcomp@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3609
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/arm_compute/core/CPP/CPPTypes.h b/arm_compute/core/CPP/CPPTypes.h
index be6dfac..fd6bfc3 100644
--- a/arm_compute/core/CPP/CPPTypes.h
+++ b/arm_compute/core/CPP/CPPTypes.h
@@ -45,7 +45,8 @@
     A53,
     A55r0,
     A55r1,
-    X1
+    X1,
+    A73
 };
 
 /** Global memory policy.
@@ -99,6 +100,10 @@
         {
             return std::string("X1");
         }
+        case CPUModel::A73:
+        {
+            return std::string("A73");
+        }
         default:
         {
             ARM_COMPUTE_ERROR("Invalid CPUModel.");
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
index aa206e3..ddb438f 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
@@ -120,13 +120,13 @@
     [](const GemmArgs &args) { return (args._Nsize < 12); },
     [](const GemmArgs &args) { return new GemmHybrid<hybrid_fp32_mla_4x8, float, float>(args); }
 },
-{
+GemmImplementation<float, float>::with_estimate(
     GemmMethod::GEMM_HYBRID,
     "hybrid_fp32_mla_16x4",
     [](const GemmArgs &args) { return (args._Ksize >= 4); },
-    [](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || (args._Msize < 16) || (args._nmulti > 1); },
+    [](const GemmArgs &args) { return GemmHybrid<hybrid_fp32_mla_16x4, float, float>::estimate_cycles(args, hybrid_fp32_mla_16x4::get_performance_parameters(args._ci)); },
     [](const GemmArgs &args) { return new GemmHybrid<hybrid_fp32_mla_16x4, float, float>(args); }
-},
+),
 
 #ifdef __ARM_FEATURE_SVE
 {
@@ -138,21 +138,21 @@
 },
 #endif // __ARM_FEATURE_SVE
 // Pretransposed, 2D split
-{
+GemmImplementation<float, float>::with_estimate(
     GemmMethod::GEMM_INTERLEAVED_2D,
     "sgemm_12x8_2d",
     nullptr,
-    [](const GemmArgs &args) { return args._maxthreads >= 8; },
+    [](const GemmArgs &args) { return GemmInterleavedPretransposed2d<sgemm_12x8, float, float>::estimate_cycles(args, sgemm_12x8::get_performance_parameters(args._ci)); },
     [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<sgemm_12x8, float, float>(args); }
-},
+),
 // 1D split (with pretransposed or not)
-{
+GemmImplementation<float, float>::with_estimate(
     GemmMethod::GEMM_INTERLEAVED,
     "sgemm_12x8_1d",
     nullptr,
-    nullptr,
+    [](const GemmArgs &args) { return GemmInterleaved<sgemm_12x8, float, float>::estimate_cycles(args, sgemm_12x8::get_performance_parameters(args._ci)); },
     [](const GemmArgs &args) { return new GemmInterleaved<sgemm_12x8, float, float>(args); }
-},
+),
 #endif // __aarch64__
 
 #ifdef __arm__
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
index 353d681..7a983ed 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
@@ -23,17 +23,15 @@
  */
 #pragma once
 
-#include <assert.h>
-
 #include <algorithm>
+#include <cassert>
 
 #include "arm_gemm.hpp"
 #include "bias_adder.hpp"
 #include "ndrange.hpp"
-#include "utils.hpp"
-
-#include "mergeresults.hpp"
+#include "performance_parameters.hpp"
 #include "transform.hpp"
+#include "utils.hpp"
 
 #ifdef CYCLE_PROFILING
 #include "profiler.hpp"
@@ -252,6 +250,28 @@
     void set_pretransposed_B_data(void *in_buffer) override {
         _B_transposed = reinterpret_cast<Toi *>(in_buffer);
     }
+
+    // Estimate cycles for given problem given provided parameters
+    static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params) {
+        // Note: Current hybrid kernels don't actually round up height (they
+        // have paths for each possible height).  Might need to make this
+        // configurable in future.
+        uint64_t total_macs = static_cast<uint64_t>(args._nbatches) * args._nmulti * args._Msize * roundup(args._Nsize, strategy::out_width()) * roundup(args._Ksize, strategy::k_unroll());
+
+        float mac_cycles = static_cast<float>(total_macs) / params.kernel_macs_cycle;
+
+        // TODO: A bit of a kludge here: current hybrid kernels incur extra
+        // overhead where the width is not a multiple of kernel width.  It's
+        // most noticeable where the overall width is quite low, so add 15%
+        // penalty for such widths.
+        if ((args._Nsize < strategy::out_width()) || (args._Nsize > strategy::out_width() && args._Nsize < 2*strategy::out_width())) {
+            mac_cycles *= 1.15f;
+        }
+
+        uint64_t total_cycles = mac_cycles;
+
+        return total_cycles;
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
index c726d7b..261e7d2 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
@@ -24,6 +24,7 @@
 
 #include "arm_gemm.hpp"
 
+#include <cstdint>
 #include <functional>
 
 namespace arm_gemm {
@@ -37,7 +38,7 @@
     const GemmMethod                                                               method;
     const char *                                                                   name;
     std::function<bool(const GemmArgs &, const OutputStage &)>                     is_supported;
-    std::function<bool(const GemmArgs &, const OutputStage &)>                     is_recommended;
+    std::function<uint64_t(const GemmArgs &, const OutputStage &)>                 cycle_estimate;
     std::function<GemmCommon<Top, Tret> *(const GemmArgs &, const OutputStage &)>  instantiate;
 
     bool do_is_supported(const GemmArgs &args, const OutputStage &os) const {
@@ -48,17 +49,27 @@
         }
     }
 
-    bool do_is_recommended(const GemmArgs &args, const OutputStage &os) const {
-        if (is_recommended != nullptr) {
-            return is_recommended(args, os);
+    uint64_t do_cycle_estimate(const GemmArgs &args, const OutputStage &os) const {
+        if (cycle_estimate != nullptr) {
+            return cycle_estimate(args, os);
         } else {
-            return true;
+            return 0;
         }
     }
 
+    GemmImplementation(const GemmImplementation &) = default;
+    GemmImplementation &operator= (const GemmImplementation &) = default;
+
     GemmCommon<Top, Tret> *do_instantiate(const GemmArgs &args, const OutputStage &os) const {
         return instantiate(args, os);
     }
+
+    GemmImplementation(GemmMethod m, const char *n,
+                       std::function<bool(const GemmArgs &, const OutputStage &)> is_supported, std::function<bool(const GemmArgs &, const OutputStage &)> is_recommended,
+                       std::function<GemmCommon<Top, Tret> *(const GemmArgs &, const OutputStage &)> instantiate) :
+                       method(m), name(n), is_supported(is_supported),
+                       cycle_estimate( [is_recommended](const GemmArgs &args, const OutputStage &os) { return (is_recommended == nullptr) ? 0 : (is_recommended(args, os) ? 0 : UINT64_MAX); } ),
+                       instantiate(instantiate) {   }
 };
 
 /* Slightly different version of above for straightforward GEMMs with no
@@ -69,7 +80,7 @@
     const GemmMethod                                          method;
     const char *                                              name;
     std::function<bool(const GemmArgs &)>                     is_supported;
-    std::function<bool(const GemmArgs &)>                     is_recommended;
+    std::function<uint64_t(const GemmArgs &)>                 cycle_estimate;
     std::function<GemmCommon<Top, Tret> *(const GemmArgs &)>  instantiate;
 
     bool do_is_supported(const GemmArgs &args, const Nothing &) const {
@@ -80,17 +91,42 @@
         }
     }
 
-    bool do_is_recommended(const GemmArgs &args, const Nothing &) const {
-        if (is_recommended != nullptr) {
-            return is_recommended(args);
+    uint64_t do_cycle_estimate(const GemmArgs &args, const Nothing &) const {
+        if (cycle_estimate != nullptr) {
+            return cycle_estimate(args);
         } else {
-            return true;
+            return 0;
         }
     }
 
     GemmCommon<Top, Tret> *do_instantiate(const GemmArgs &args, const Nothing &) const {
         return instantiate(args);
     }
+
+
+    static GemmImplementation with_estimate(GemmMethod m, const char *n,
+                       std::function<bool(const GemmArgs &)> is_supported, std::function<uint64_t(const GemmArgs &)> cycle_estimate,
+                       std::function<GemmCommon<Top, Tret> *(const GemmArgs &)> instantiate) {
+        GemmImplementation impl(m,n);
+
+        impl.is_supported=is_supported;
+        impl.cycle_estimate=cycle_estimate;
+        impl.instantiate=instantiate;
+
+        return impl;
+    }
+
+    GemmImplementation(GemmMethod m, const char * n) : method(m), name(n), is_supported(nullptr), cycle_estimate(nullptr), instantiate(nullptr) {}
+
+    GemmImplementation(GemmMethod m, const char *n,
+                       std::function<bool(const GemmArgs &)> is_supported, std::function<bool(const GemmArgs &)> is_recommended,
+                       std::function<GemmCommon<Top, Tret> *(const GemmArgs &)> instantiate) :
+                       method(m), name(n), is_supported(is_supported),
+                       cycle_estimate( [is_recommended](const GemmArgs &args) -> uint64_t { return (is_recommended == nullptr) ? 0 : (is_recommended(args) ? 0 : UINT64_MAX); } ),
+                       instantiate(instantiate) {   }
+
+    GemmImplementation(const GemmImplementation &) = default;
+    GemmImplementation &operator=(const GemmImplementation &) = default;
 };
 
 /* "Master" function implemented for each valid combination of types.
@@ -103,13 +139,11 @@
 /*
  * Select a GEMM implementation for the given arguments.
  *
- * The logic here returns the first method on the list which supports the
+ * The logic here returns the method on the list which supports the
  * requested problem parameters, matches the provided filters (method and/or
- * name string match) and recommends itself.
- *
- * If there is no such method, it will return the first method which
- * supports the requested parameters and passes the filters, regardless of
- * recommendation.
+ * name string match) and offers the lowest cycle estimate.  A cycle
+ * estimate of '0' is treated as a special value, causing the corresponding
+ * method to be selected immediately.
  *
  * If no method supports the requested parameters and passes the filters,
  * this function returns false and doesn't touch the provided pointer
@@ -121,6 +155,7 @@
     const GemmConfig *cfg = args._cfg;
 
     const GemmImplementation<Top, Tret, OutputStage> *saved_impl = nullptr;
+    uint64_t best_estimate = 0;
 
     for (const GemmImplementation<Top, Tret, OutputStage> *i = gemms; i->method != GemmMethod::DEFAULT; i++) {
         /* Skip if this implementation doesn't support these args. */
@@ -138,27 +173,24 @@
             continue;
         }
 
-        /* At this point, if we don't have a saved implementation, save this
-         * one.  This is so that we always return something if a filter
-         * matches, even if it doesn't recommend itself.
-         */
-        if (saved_impl == nullptr) {
-            saved_impl=i;
+        /* Test the cycle estimate */
+        uint64_t estimate = i->do_cycle_estimate(args, os);
+
+        /* Short circuit - if the estimate is zero, return this one immediately. */
+        if (estimate==0) {
+            impl=i;
+            return true;
         }
 
-        /* Check that this method recommends itself. */
-        if (!i->do_is_recommended(args, os)) {
-            continue;
+        /* Otherwise, remember this is our best so far if we don't yet have
+         * a valid candidate, or we beat the estimate.  */
+        if ((saved_impl == nullptr) || (estimate < best_estimate)) {
+            saved_impl = i;
+            best_estimate = estimate;
         }
-
-        impl=i;
-
-        return true;
     }
 
-    /* We didn't find an option matching the filters that recommended
-     * itself.  But if we found something earlier that matched the filters
-     * but wasn't recommended, return it here.  */
+    /* Return whichever method gave the best estimate. */
     if (saved_impl != nullptr) {
         impl = saved_impl;
         return true;
@@ -183,7 +215,7 @@
             continue;
         }
 
-        res.push_back(KernelDescription(i->method, i->name, i==default_impl));
+        res.push_back(KernelDescription(i->method, i->name, i==default_impl, i->do_cycle_estimate(args, os)));
     }
 
     return res;
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
index 3ee4749..bddcc8d 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
@@ -122,6 +122,13 @@
     [](const GemmArgs &args) { return new GemmInterleaved<gemm_s8_12x8, int8_t, int32_t>(args); }
 },
 {
+    GemmMethod::GEMM_INTERLEAVED,
+    "gemm_s16_12x8",
+    nullptr,
+    [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53; },
+    [](const GemmArgs &args) { return new GemmInterleaved<gemm_s16_12x8, int8_t, int32_t>(args); },
+},
+{
     GemmMethod::GEMM_INTERLEAVED_2D,
     "gemm_s8_4x4_2d",
     nullptr,
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index 3b82949..c4dceef 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -23,15 +23,14 @@
  */
 #pragma once
 
-#include <stdio.h>
-#include <assert.h>
-
 #include <algorithm>
+#include <cassert>
 
 #include "arm_gemm.hpp"
 #include "utils.hpp"
 
 #include "mergeresults.hpp"
+#include "performance_parameters.hpp"
 #include "transform.hpp"
 
 #ifdef CYCLE_PROFILING
@@ -149,6 +148,33 @@
         return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
     }
 
+    static unsigned int get_k_block_size(const GemmArgs &args) {
+        if (args._cfg && args._cfg->inner_block_size) {
+            return args._cfg->inner_block_size;
+        }
+
+        const unsigned int L1_size = args._ci->get_L1_cache_size();
+        unsigned int k_block;
+
+        // k_block: Find out how much of the larger array can be loaded into half the cache.
+        // This should account for associative caches.
+        k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
+
+        // Needs to be (at least a single) multiple of the K unroll level.
+        k_block /= strategy::k_unroll();
+        k_block = std::max(k_block, 1U) * strategy::k_unroll();
+
+        // Now tune to presented problem size; this is how many blocks we need.
+        unsigned int num_k_blocks = iceildiv(args._Ksize, k_block);
+
+        // So divide the space equally into that many blocks.
+        k_block = iceildiv(args._Ksize, num_k_blocks);
+
+        // And round UP to the K unroll level required.
+        k_block = roundup(k_block, strategy::k_unroll());
+
+        return k_block;
+    }
 
 public:
     GemmInterleaved(GemmInterleaved &) = delete;
@@ -158,35 +184,14 @@
     GemmInterleaved(const GemmArgs &args)
                     : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
                       _nbatches(args._nbatches), _nmulti(args._nmulti),
-                      _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads) {
-        const unsigned int L1_size = _ci->get_L1_cache_size();
+                      _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
+                      _k_block(get_k_block_size(args)) {
         const unsigned int L2_size = _ci->get_L2_cache_size();
 
         assert(_maxthreads > 0);
 
         // Work out blocking parameters, or override from provided GemmConfig
-        if (args._cfg && args._cfg->inner_block_size) {
-            _k_block = args._cfg->inner_block_size;
-        } else {
-            // k_block: Find out how much of the larger array can be loaded into half the cache.
-            // This should account for associative caches.
-            _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
-
-            // Needs to be (at least a single) multiple of the K unroll level.
-            _k_block /= strategy::k_unroll();
-            _k_block = std::max(_k_block, 1U) * strategy::k_unroll();
-
-            // Now tune to presented problem size; this is how many blocks we need.
-            unsigned int num_k_blocks = iceildiv(_Ksize, _k_block);
-
-            // So divide the space equally into that many blocks.
-            _k_block = iceildiv(_Ksize, num_k_blocks);
-
-            // And round UP to the K unroll level required.
-            _k_block = iceildiv(_k_block, strategy::k_unroll());
-            _k_block *= strategy::k_unroll();
-        }
-
+        // TODO: Move outer block into a static function too.
         if (args._cfg && args._cfg->outer_block_size) {
             _x_block = args._cfg->outer_block_size;
         } else {
@@ -422,6 +427,31 @@
     void set_pretransposed_B_data(void *in_buffer) override {
         _B_transposed = reinterpret_cast<Toi *>(in_buffer);
     }
+
+    // Estimate cycles for given problem given provided parameters
+    static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params) {
+        unsigned int k_blocks = iceildiv(args._Ksize, get_k_block_size(args));
+
+        uint64_t total_macs    = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * roundup(args._Ksize, strategy::k_unroll());
+        uint64_t prepare_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Ksize, strategy::k_unroll()) * sizeof(Toi);
+        uint64_t merge_bytes   = static_cast<uint64_t>(args._nbatches) * args._nmulti * k_blocks * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * sizeof(Tr);
+
+        float mac_cycles     = static_cast<float>(total_macs) / params.kernel_macs_cycle;
+        float prepare_cycles = static_cast<float>(prepare_bytes) / params.prepare_bytes_cycle;
+        float merge_cycles   = static_cast<float>(merge_bytes) / params.merge_bytes_cycle;
+
+        float total_cycles = mac_cycles + prepare_cycles + merge_cycles;
+
+        // We can't thread over multis or width, which makes this a poor
+        // choice in many threaded cases.  Penalize that here.
+        float parallelism_available = static_cast<float>(iceildiv(args._Msize, strategy::out_height()) * args._nbatches) * 0.9f;
+
+        if (parallelism_available < args._maxthreads) {
+            total_cycles *= (static_cast<float>(args._maxthreads) / parallelism_available);
+        }
+
+        return static_cast<uint64_t>(total_cycles);
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
index ebe33ab..bdccd05 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
@@ -35,6 +35,7 @@
 
 #include <algorithm>
 #include <cassert>
+#include <cmath>
 
 // Some macros used to decide how much working space to allocate.
 // Round allocations up to the next cache line.
@@ -301,6 +302,36 @@
         }
     }
 
+    static unsigned int get_k_block_size(const GemmArgs &args) {
+        // Work out blocking parameters, or override from provided GemmConfig
+        if (args._cfg && args._cfg->inner_block_size) {
+            return args._cfg->inner_block_size;
+        }
+
+        const unsigned int L1_size = args._ci->get_L1_cache_size();
+        unsigned int k_block;
+
+        // k_block: Find out how much of the larger array can be loaded into half the cache.
+        // This should account for associative caches.
+        k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
+
+        // Needs to be (at least a single) multiple of the K unroll level.
+        k_block /= strategy::k_unroll();
+        k_block = std::max(k_block, 1U) * strategy::k_unroll();
+
+        // Now tune to presented problem size; this is how many blocks we need.
+        unsigned int num_k_blocks = iceildiv(args._Ksize, k_block);
+
+        // So divide the space equally into that many blocks.
+        k_block = iceildiv(args._Ksize, num_k_blocks);
+
+        // And round UP to the K unroll level required.
+        k_block = iceildiv(k_block, strategy::k_unroll());
+        k_block *= strategy::k_unroll();
+
+        return k_block;
+    }
+
 public:
     GemmInterleavedPretransposed2d(GemmInterleavedPretransposed2d &) = delete;
     GemmInterleavedPretransposed2d & operator= (GemmInterleavedPretransposed2d &) = delete;
@@ -315,8 +346,8 @@
     ,    _nmulti(args._nmulti)
     ,    _act(args._act)
     ,    _maxthreads(args._maxthreads)
-    ,    _nthreads(args._maxthreads) 
-
+    ,    _nthreads(args._maxthreads)
+    ,    _k_block(get_k_block_size(args))
     // Work out the rounded size of M - needed for some buffers.
     ,    _Mround_div ( iceildiv(_Msize, strategy::out_height()) )
     ,    _Mround     ( _Mround_div * strategy::out_height()     )
@@ -326,32 +357,8 @@
     {
         assert(_maxthreads > 0);
 
-        const unsigned int L1_size = _ci->get_L1_cache_size();
         const unsigned int L2_size = _ci->get_L2_cache_size();
 
-        // Work out blocking parameters, or override from provided GemmConfig
-        if (args._cfg && args._cfg->inner_block_size) {
-            _k_block = args._cfg->inner_block_size;
-        } else {
-            // k_block: Find out how much of the larger array can be loaded into half the cache.
-            // This should account for associative caches.
-            _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
-
-            // Needs to be (at least a single) multiple of the K unroll level.
-            _k_block /= strategy::k_unroll();
-            _k_block = std::max(_k_block, 1U) * strategy::k_unroll();
-
-            // Now tune to presented problem size; this is how many blocks we need.
-            unsigned int num_k_blocks = iceildiv(_Ksize, _k_block);
-
-            // So divide the space equally into that many blocks.
-            _k_block = iceildiv(_Ksize, num_k_blocks);
-
-            // And round UP to the K unroll level required.
-            _k_block = iceildiv(_k_block, strategy::k_unroll());
-            _k_block *= strategy::k_unroll();
-        }
-
         if (args._cfg && args._cfg->outer_block_size) {
             _x_block = args._cfg->outer_block_size;
         } else {
@@ -381,6 +388,10 @@
         return { m, n };
     }
 
+    bool supports_dynamic_scheduling() const override {
+        return true;
+    }
+
     // set_nthreads: pass on to buffer manager to avoid it waiting for non-existent threads.
     void set_nthreads(int nthreads) override {
         _nthreads = std::min(nthreads, _maxthreads);
@@ -495,7 +506,60 @@
         _B_transposed = reinterpret_cast<Toi *>(in_buffer);
     }
 
-    ~GemmInterleavedPretransposed2d() override { }
+    // Estimate cycles for given problem given provided parameters
+    static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params) {
+        unsigned int k_blocks = iceildiv(args._Ksize, get_k_block_size(args));
+        unsigned int m_blocks = iceildiv(args._Msize, strategy::out_height()) * args._nbatches;
+        unsigned int n_blocks = iceildiv(args._Nsize, strategy::out_width());
+
+        uint64_t total_macs    = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * roundup(args._Ksize, strategy::k_unroll());
+        uint64_t prepare_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Ksize, strategy::k_unroll()) * sizeof(Toi);
+        uint64_t merge_bytes   = static_cast<uint64_t>(args._nbatches) * args._nmulti * k_blocks * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * sizeof(Tr);
+
+        // Wide problems incur extra preparation cost, as it is done per thread.
+        // Duplicate the logic the scheduler will later use to figure out how much that will affect us
+        float ratio = m_blocks / static_cast<float>(n_blocks);
+
+        unsigned int ideal_height = static_cast<unsigned int>(std::sqrt(args._maxthreads * ratio) + 0.5);
+        unsigned int height = 1;
+
+        if (ideal_height == 0) {
+            height = 1;
+        } else {
+            for (unsigned int adj=0; adj<ideal_height; adj++) {
+                const unsigned int round_down = ideal_height - adj;
+                if (args._maxthreads % round_down == 0) {
+                    height = round_down;
+                    break;
+                }
+
+                const unsigned int round_up = ideal_height + adj;
+                if (args._maxthreads % round_up == 0) {
+                    height = round_up;
+                    break;
+                }
+            }
+        }
+
+        // We've computed the height here - we need to multiply the amount of preparation effort by the width (which is total threads / height)
+        prepare_bytes *= (args._maxthreads / height);
+
+        float mac_cycles     = static_cast<float>(total_macs) / params.kernel_macs_cycle;
+        float prepare_cycles = static_cast<float>(prepare_bytes) / params.prepare_bytes_cycle;
+        float merge_cycles   = static_cast<float>(merge_bytes) / params.merge_bytes_cycle;
+
+        float total_cycles = mac_cycles + prepare_cycles + merge_cycles;
+
+        // We can't thread over multis, which might be a problem in some
+        // threaded cases.  Penalize that here.
+        float parallelism_available = static_cast<float>(iceildiv(args._Msize, strategy::out_height()) * args._nbatches * iceildiv(args._Nsize, strategy::out_width())) * 0.9;
+
+        if (parallelism_available < args._maxthreads) {
+            total_cycles *= (static_cast<float>(args._maxthreads) / parallelism_available);
+        }
+
+        return static_cast<uint64_t>(total_cycles);
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
index caab2e2..88726b1 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
@@ -101,6 +101,13 @@
     [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_u8u32_dot_4x6, uint8_t, uint32_t>(args); }
 },
 {
+    GemmMethod::GEMM_INTERLEAVED,
+    "gemm_u16_12x8",
+    nullptr,
+    [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53; },
+    [](const GemmArgs &args) { return new GemmInterleaved<gemm_u16_12x8, uint8_t, uint32_t>(args); },
+},
+{
     GemmMethod::GEMM_HYBRID,
     "hybrid_u8u32_dot_16x4",
     [](const GemmArgs &args) { return args._ci->has_dotprod() && args._Ksize>=16; },
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp
index 8d8ede8..4147ab6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp
@@ -25,7 +25,7 @@
 
 #ifdef __aarch64__
 
-
+#include "../performance_parameters.hpp"
 #include "../std_transforms_fixed.hpp"
 
 namespace arm_gemm
@@ -75,6 +75,22 @@
         return true;
     }
 
+    static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
+        switch (ci->get_cpu_model()) {
+            case CPUModel::A55r1:
+                return { 2.866 };
+
+            case CPUModel::A53:
+                return { 1.419 };
+
+            case CPUModel::A73:
+                return { 2.551 };
+
+            default:
+                return { 6.25 };
+        }
+    }
+
     StdTransformsFixed<operand_type, result_type, 4, 16, 1> transforms = {};
 
     // Default to the generic kernel
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
index 5c3d640..981ce34 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
@@ -67,6 +67,22 @@
     // Use the standard fixed size transforms.
     StdTransformsFixed<operand_type, result_type, 8, 12> transforms = {};
 
+    static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
+        switch (ci->get_cpu_model()) {
+            case CPUModel::A55r1:
+                return { 3.724, 1.416, 1.113 };
+
+            case CPUModel::A53:
+                return { 2.777, 0.987, 0.898 };
+
+            case CPUModel::A73:
+                return { 2.885, 1.429, 1.163 };
+
+            default:
+                return { 6.949, 4.149, 2.826 };
+        }
+    }
+
     kern_type kernel=a64_sgemm_asimd_12x8;
 
     sgemm_12x8(const CPUInfo *ci) {
diff --git a/src/core/NEON/kernels/arm_gemm/performance_parameters.hpp b/src/core/NEON/kernels/arm_gemm/performance_parameters.hpp
new file mode 100644
index 0000000..059ab5f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/performance_parameters.hpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+namespace arm_gemm {
+
+struct PerformanceParameters {
+    float	kernel_macs_cycle;
+    float	prepare_bytes_cycle = 0.0f;
+    float	merge_bytes_cycle   = 0.0f;
+
+    PerformanceParameters(float k) : kernel_macs_cycle(k) { }
+    PerformanceParameters(float k, float p, float m) : kernel_macs_cycle(k), prepare_bytes_cycle(p), merge_bytes_cycle(m) { }
+};
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_s8_to_s16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_s8_to_s16.hpp
new file mode 100644
index 0000000..37344a8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_s8_to_s16.hpp
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS)
+
+#include <arm_neon.h>
+#include <cstdint>
+
+#include "../asmlib.hpp"
+
+template<>
+template<>
+inline void TransformImpl<8, 1, false, 2, 1, false>::Transform(int16_t *out, const int8_t *in, int ldin, int y0, int ymax, int k0, int kmax) {
+    int16_t *outptr = out;
+    const int8_t *inptr = in;
+    bool first = true;
+
+    int8_t zerobuff[32] = { 0 }; // 16 for asm loop plus up to 15 for overflow loop
+
+    for (int y=y0; y<ymax; y+=8) {
+        const int8_t *inptr0 = inptr + y * ldin + k0;
+        const int8_t *inptr1 = inptr0 + ldin;
+        const int8_t *inptr2 = inptr1 + ldin;
+        const int8_t *inptr3 = inptr2 + ldin;
+        const int8_t *inptr4 = inptr3 + ldin;
+        const int8_t *inptr5 = inptr4 + ldin;
+        const int8_t *inptr6 = inptr5 + ldin;
+        const int8_t *inptr7 = inptr6 + ldin;
+
+        prefetch_2x(inptr0);
+        prefetch_2x(inptr1);
+        prefetch_2x(inptr2);
+        prefetch_2x(inptr3);
+        prefetch_2x(inptr4);
+        prefetch_2x(inptr5);
+        prefetch_2x(inptr6);
+        prefetch_2x(inptr7);
+
+        int x=(kmax-k0);
+        for (;(x>15) || first;x-=16) {
+            /* Cope with ragged cases by copying from a buffer of zeroes instead */
+            /* 'first' forces this to always run at least once, needed if the total size is <=15. */
+            if ((y + 7) >= ymax) {
+                switch ((y + 7) - ymax) {
+                    case 6:
+                        inptr1 = zerobuff;
+                        // fall through
+                    case 5:
+                        inptr2 = zerobuff;
+                        // fall through
+                    case 4:
+                        inptr3 = zerobuff;
+                        // fall through
+                    case 3:
+                        inptr4 = zerobuff;
+                        // fall through
+                    case 2:
+                        inptr5 = zerobuff;
+                        // fall through
+                    case 1:
+                        inptr6 = zerobuff;
+                        // fall through
+                    case 0:
+                        inptr7 = zerobuff;
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+            }
+
+            if (first) {
+                if (x<=15) {
+                    break;
+                }
+
+                first = false;
+            }
+
+            __asm __volatile (
+                // Load up 16 elements (1 source vector, 2 destination vectors) from each of 8 sources.
+                "LDR	q0, [%[inptr0]], #16\n"
+                "LDR	q2, [%[inptr1]], #16\n"
+                "SSHLL2 v1.8h, v0.16b, #0\n"
+                "SSHLL  v0.8h, v0.8b, #0\n"
+                "LDR	q4, [%[inptr2]], #16\n" // q4=C0C1C2C3
+                "SSHLL2 v3.8h, v2.16b, #0\n"
+                "SSHLL  v2.8h, v2.8b, #0\n"
+                "SSHLL2 v5.8h, v4.16b, #0\n"
+                "SSHLL  v4.8h, v4.8b, #0\n"
+                "ZIP1	v16.8h, v0.8h, v4.8h\n" // q16=A0C0A1C1
+                ASM_PREFETCH("[%[inptr0], #128]")
+                "LDR	q6, [%[inptr3]], #16\n" // q6=D0D1D2D3
+                "SSHLL2 v7.8h, v6.16b, #0\n"
+                "SSHLL  v6.8h, v6.8b, #0\n"
+                "ZIP1	v17.8h, v2.8h, v6.8h\n" // q17=B0D0B1D1
+                "LDR	q8, [%[inptr4]], #16\n"
+                "LDR	q10, [%[inptr5]], #16\n"
+                "SSHLL2 v9.8h, v8.16b, #0\n"
+                "SSHLL  v8.8h, v8.8b, #0\n"
+                ASM_PREFETCH("[%[inptr1], #128]")
+                "LDR	q12, [%[inptr6]], #16\n"
+                "SSHLL2 v11.8h, v10.16b, #0\n"
+                "SSHLL  v10.8h, v10.8b, #0\n"
+                "SSHLL2 v13.8h, v12.16b, #0\n"
+                "SSHLL  v12.8h, v12.8b, #0\n"
+                "ZIP1	v18.8h, v8.8h, v12.8h\n"
+                "LDR	q14, [%[inptr7]], #16\n"
+                "SSHLL2 v15.8h, v14.16b, #0\n"
+                "SSHLL  v14.8h, v14.8b, #0\n"
+                "ZIP1	v19.8h, v10.8h, v14.8h\n"
+
+                ASM_PREFETCH("[%[inptr2], #128]")
+                "ZIP1	v20.8h, v16.8h, v17.8h\n" // q20=A0B0C0D0A1B1C1D1
+                "ZIP1	v21.8h, v18.8h, v19.8h\n" // q21=E0F0G0H0E1F1G1H1
+                "ZIP2	v22.8h, v16.8h, v17.8h\n" // q22=A2B2C2D2A3B3C3D3
+                "ZIP2	v23.8h, v18.8h, v19.8h\n" // q23=E2F2G2H1E3F3G3H3
+                ASM_PREFETCH("[%[inptr3], #128]")
+
+                "ZIP2	v16.8h, v0.8h, v4.8h\n"
+                "ZIP2	v17.8h, v2.8h, v6.8h\n"
+                "TRN1	v24.2d, v20.2d, v21.2d\n"
+                "TRN2	v25.2d, v20.2d, v21.2d\n"
+
+                "ZIP2	v18.8h, v8.8h, v12.8h\n"
+                ASM_PREFETCH("[%[inptr4], #128]")
+                "ZIP2	v19.8h, v10.8h, v14.8h\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Write back the first element of each source
+                "TRN1	v24.2d, v22.2d, v23.2d\n"
+                "TRN2	v25.2d, v22.2d, v23.2d\n"
+
+                "ZIP1	v20.8h, v16.8h, v17.8h\n"
+                "ZIP1	v21.8h, v18.8h, v19.8h\n"
+                ASM_PREFETCH("[%[inptr5], #128]")
+                "ZIP2	v22.8h, v16.8h, v17.8h\n"
+                "ZIP2	v23.8h, v18.8h, v19.8h\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Write back the second element of each source
+
+                "ZIP1	v16.8h, v1.8h, v5.8h\n"
+                "ZIP1	v17.8h, v3.8h, v7.8h\n"
+                ASM_PREFETCH("[%[inptr6], #128]")
+                "TRN1	v24.2d, v20.2d, v21.2d\n"
+                "TRN2	v25.2d, v20.2d, v21.2d\n"
+
+                "ZIP1	v18.8h, v9.8h, v13.8h\n"
+                "ZIP1	v19.8h, v11.8h, v15.8h\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Third element
+                "TRN1	v24.2d, v22.2d, v23.2d\n"
+                "TRN2	v25.2d, v22.2d, v23.2d\n"
+                ASM_PREFETCH("[%[inptr7], #128]")
+
+                "ZIP1	v20.8h, v16.8h, v17.8h\n"
+                "ZIP1	v21.8h, v18.8h, v19.8h\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Fourth element
+                "ZIP2	v22.8h, v16.8h, v17.8h\n"
+                "ZIP2	v23.8h, v18.8h, v19.8h\n"
+
+                "ZIP2	v16.8h, v1.8h, v5.8h\n"
+                "ZIP2	v17.8h, v3.8h, v7.8h\n"
+                "TRN1	v24.2d, v20.2d, v21.2d\n"
+                "TRN2	v25.2d, v20.2d, v21.2d\n"
+
+                "ZIP2	v18.8h, v9.8h, v13.8h\n"
+                "ZIP2	v19.8h, v11.8h, v15.8h\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Fifth element
+                "TRN1	v24.2d, v22.2d, v23.2d\n"
+                "TRN2	v25.2d, v22.2d, v23.2d\n"
+
+                "ZIP1	v20.8h, v16.8h, v17.8h\n"
+                "ZIP1	v21.8h, v18.8h, v19.8h\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Sixth element
+                "TRN1	v24.2d, v20.2d, v21.2d\n"
+                "TRN2	v25.2d, v20.2d, v21.2d\n"
+
+                "ZIP2	v22.8h, v16.8h, v17.8h\n"
+                "ZIP2	v23.8h, v18.8h, v19.8h\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Seventh element
+                "TRN1	v24.2d, v22.2d, v23.2d\n"
+                "TRN2	v25.2d, v22.2d, v23.2d\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Eighth element
+                : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
+                  [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
+                :
+                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
+                  "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "memory"
+            );
+        }
+
+        for (;x>0;x--) {
+            *outptr++ = *inptr0++;
+            *outptr++ = *inptr1++;
+            *outptr++ = *inptr2++;
+            *outptr++ = *inptr3++;
+            *outptr++ = *inptr4++;
+            *outptr++ = *inptr5++;
+            *outptr++ = *inptr6++;
+            *outptr++ = *inptr7++;
+        }
+    }
+}
+
+#endif // __aarch64__ && __ARM_FP16_ARGS
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_u8_to_u16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_u8_to_u16.hpp
new file mode 100644
index 0000000..a3a269c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_u8_to_u16.hpp
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS)
+
+#include <arm_neon.h>
+#include <cstdint>
+
+#include "../asmlib.hpp"
+
+template<>
+template<>
+inline void TransformImpl<8, 1, false, 2, 1, false>::Transform(uint16_t *out, const uint8_t *in, int ldin, int y0, int ymax, int k0, int kmax) {
+    uint16_t *outptr = out;
+    const uint8_t *inptr = in;
+    bool first = true;
+
+    uint8_t zerobuff[32] = { 0 }; // 16 for asm loop plus up to 15 for overflow loop
+
+    for (int y=y0; y<ymax; y+=8) {
+        const uint8_t *inptr0 = inptr + y * ldin + k0;
+        const uint8_t *inptr1 = inptr0 + ldin;
+        const uint8_t *inptr2 = inptr1 + ldin;
+        const uint8_t *inptr3 = inptr2 + ldin;
+        const uint8_t *inptr4 = inptr3 + ldin;
+        const uint8_t *inptr5 = inptr4 + ldin;
+        const uint8_t *inptr6 = inptr5 + ldin;
+        const uint8_t *inptr7 = inptr6 + ldin;
+
+        prefetch_2x(inptr0);
+        prefetch_2x(inptr1);
+        prefetch_2x(inptr2);
+        prefetch_2x(inptr3);
+        prefetch_2x(inptr4);
+        prefetch_2x(inptr5);
+        prefetch_2x(inptr6);
+        prefetch_2x(inptr7);
+
+        int x=(kmax-k0);
+        for (;(x>15) || first;x-=16) {
+            /* Cope with ragged cases by copying from a buffer of zeroes instead */
+            /* 'first' forces this to always run at least once, needed if the total size is <=15. */
+            if ((y + 7) >= ymax) {
+                switch ((y + 7) - ymax) {
+                    case 6:
+                        inptr1 = zerobuff;
+                        // fall through
+                    case 5:
+                        inptr2 = zerobuff;
+                        // fall through
+                    case 4:
+                        inptr3 = zerobuff;
+                        // fall through
+                    case 3:
+                        inptr4 = zerobuff;
+                        // fall through
+                    case 2:
+                        inptr5 = zerobuff;
+                        // fall through
+                    case 1:
+                        inptr6 = zerobuff;
+                        // fall through
+                    case 0:
+                        inptr7 = zerobuff;
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+            }
+
+            if (first) {
+                if (x<=15) {
+                    break;
+                }
+
+                first = false;
+            }
+
+            __asm __volatile (
+                // Load up 16 elements (1 source vector, 2 destination vectors) from each of 8 sources.
+                "LDR	q0, [%[inptr0]], #16\n"
+                "LDR	q2, [%[inptr1]], #16\n"
+                "USHLL2 v1.8h, v0.16b, #0\n"
+                "USHLL  v0.8h, v0.8b, #0\n"
+                "LDR	q4, [%[inptr2]], #16\n" // q4=C0C1C2C3
+                "USHLL2 v3.8h, v2.16b, #0\n"
+                "USHLL  v2.8h, v2.8b, #0\n"
+                "USHLL2 v5.8h, v4.16b, #0\n"
+                "USHLL  v4.8h, v4.8b, #0\n"
+                "ZIP1	v16.8h, v0.8h, v4.8h\n" // q16=A0C0A1C1
+                ASM_PREFETCH("[%[inptr0], #128]")
+                "LDR	q6, [%[inptr3]], #16\n" // q6=D0D1D2D3
+                "USHLL2 v7.8h, v6.16b, #0\n"
+                "USHLL  v6.8h, v6.8b, #0\n"
+                "ZIP1	v17.8h, v2.8h, v6.8h\n" // q17=B0D0B1D1
+                "LDR	q8, [%[inptr4]], #16\n"
+                "LDR	q10, [%[inptr5]], #16\n"
+                "USHLL2 v9.8h, v8.16b, #0\n"
+                "USHLL  v8.8h, v8.8b, #0\n"
+                ASM_PREFETCH("[%[inptr1], #128]")
+                "LDR	q12, [%[inptr6]], #16\n"
+                "USHLL2 v11.8h, v10.16b, #0\n"
+                "USHLL  v10.8h, v10.8b, #0\n"
+                "USHLL2 v13.8h, v12.16b, #0\n"
+                "USHLL  v12.8h, v12.8b, #0\n"
+                "ZIP1	v18.8h, v8.8h, v12.8h\n"
+                "LDR	q14, [%[inptr7]], #16\n"
+                "USHLL2 v15.8h, v14.16b, #0\n"
+                "USHLL  v14.8h, v14.8b, #0\n"
+                "ZIP1	v19.8h, v10.8h, v14.8h\n"
+
+                ASM_PREFETCH("[%[inptr2], #128]")
+                "ZIP1	v20.8h, v16.8h, v17.8h\n" // q20=A0B0C0D0A1B1C1D1
+                "ZIP1	v21.8h, v18.8h, v19.8h\n" // q21=E0F0G0H0E1F1G1H1
+                "ZIP2	v22.8h, v16.8h, v17.8h\n" // q22=A2B2C2D2A3B3C3D3
+                "ZIP2	v23.8h, v18.8h, v19.8h\n" // q23=E2F2G2H1E3F3G3H3
+                ASM_PREFETCH("[%[inptr3], #128]")
+
+                "ZIP2	v16.8h, v0.8h, v4.8h\n"
+                "ZIP2	v17.8h, v2.8h, v6.8h\n"
+                "TRN1	v24.2d, v20.2d, v21.2d\n"
+                "TRN2	v25.2d, v20.2d, v21.2d\n"
+
+                "ZIP2	v18.8h, v8.8h, v12.8h\n"
+                ASM_PREFETCH("[%[inptr4], #128]")
+                "ZIP2	v19.8h, v10.8h, v14.8h\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Write back the first element of each source
+                "TRN1	v24.2d, v22.2d, v23.2d\n"
+                "TRN2	v25.2d, v22.2d, v23.2d\n"
+
+                "ZIP1	v20.8h, v16.8h, v17.8h\n"
+                "ZIP1	v21.8h, v18.8h, v19.8h\n"
+                ASM_PREFETCH("[%[inptr5], #128]")
+                "ZIP2	v22.8h, v16.8h, v17.8h\n"
+                "ZIP2	v23.8h, v18.8h, v19.8h\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Write back the second element of each source
+
+                "ZIP1	v16.8h, v1.8h, v5.8h\n"
+                "ZIP1	v17.8h, v3.8h, v7.8h\n"
+                ASM_PREFETCH("[%[inptr6], #128]")
+                "TRN1	v24.2d, v20.2d, v21.2d\n"
+                "TRN2	v25.2d, v20.2d, v21.2d\n"
+
+                "ZIP1	v18.8h, v9.8h, v13.8h\n"
+                "ZIP1	v19.8h, v11.8h, v15.8h\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Third element
+                "TRN1	v24.2d, v22.2d, v23.2d\n"
+                "TRN2	v25.2d, v22.2d, v23.2d\n"
+                ASM_PREFETCH("[%[inptr7], #128]")
+
+                "ZIP1	v20.8h, v16.8h, v17.8h\n"
+                "ZIP1	v21.8h, v18.8h, v19.8h\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Fourth element
+                "ZIP2	v22.8h, v16.8h, v17.8h\n"
+                "ZIP2	v23.8h, v18.8h, v19.8h\n"
+
+                "ZIP2	v16.8h, v1.8h, v5.8h\n"
+                "ZIP2	v17.8h, v3.8h, v7.8h\n"
+                "TRN1	v24.2d, v20.2d, v21.2d\n"
+                "TRN2	v25.2d, v20.2d, v21.2d\n"
+
+                "ZIP2	v18.8h, v9.8h, v13.8h\n"
+                "ZIP2	v19.8h, v11.8h, v15.8h\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Fifth element
+                "TRN1	v24.2d, v22.2d, v23.2d\n"
+                "TRN2	v25.2d, v22.2d, v23.2d\n"
+
+                "ZIP1	v20.8h, v16.8h, v17.8h\n"
+                "ZIP1	v21.8h, v18.8h, v19.8h\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Sixth element
+                "TRN1	v24.2d, v20.2d, v21.2d\n"
+                "TRN2	v25.2d, v20.2d, v21.2d\n"
+
+                "ZIP2	v22.8h, v16.8h, v17.8h\n"
+                "ZIP2	v23.8h, v18.8h, v19.8h\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Seventh element
+                "TRN1	v24.2d, v22.2d, v23.2d\n"
+                "TRN2	v25.2d, v22.2d, v23.2d\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Eighth element
+                : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
+                  [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
+                :
+                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
+                  "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "memory"
+            );
+        }
+
+        for (;x>0;x--) {
+            *outptr++ = *inptr0++;
+            *outptr++ = *inptr1++;
+            *outptr++ = *inptr2++;
+            *outptr++ = *inptr3++;
+            *outptr++ = *inptr4++;
+            *outptr++ = *inptr5++;
+            *outptr++ = *inptr6++;
+            *outptr++ = *inptr7++;
+        }
+    }
+}
+
+#endif // __aarch64__ && __ARM_FP16_ARGS
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
index 2c698b2..b825e1c 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
@@ -28,6 +28,8 @@
 #include "a64_interleave_8way_32bit.hpp"
 #include "a64_interleave_8way_block4_8bit.hpp"
 #include "a64_interleave_8way_half_to_float.hpp"
+#include "a64_interleave_8way_s8_to_s16.hpp"
+#include "a64_interleave_8way_u8_to_u16.hpp"
 #include "a64_transpose_interleave_12way_16bit.hpp"
 #include "a64_transpose_interleave_12way_half_to_float.hpp"
 #include "a64_transpose_interleave_24way_16bit.hpp"
diff --git a/src/core/NEON/kernels/assembly/arm_gemm.hpp b/src/core/NEON/kernels/assembly/arm_gemm.hpp
index e3e1547..58db511 100644
--- a/src/core/NEON/kernels/assembly/arm_gemm.hpp
+++ b/src/core/NEON/kernels/assembly/arm_gemm.hpp
@@ -48,12 +48,13 @@
 
 struct KernelDescription
 {
-    GemmMethod  method     = GemmMethod::DEFAULT;
-    std::string name       = "";
-    bool        is_default = false;
+    GemmMethod  method         = GemmMethod::DEFAULT;
+    std::string name           = "";
+    bool        is_default     = false;
+    uint64_t    cycle_estimate = 0;
 
-    KernelDescription(GemmMethod m, std::string n, bool d = false)
-        : method(m), name(n), is_default(d)
+    KernelDescription(GemmMethod m, std::string n, bool d = false, uint64_t c = 0)
+        : method(m), name(n), is_default(d), cycle_estimate(c)
     {
     }
     KernelDescription() noexcept
diff --git a/src/runtime/CPUUtils.cpp b/src/runtime/CPUUtils.cpp
index 865e038..4d6caae 100644
--- a/src/runtime/CPUUtils.cpp
+++ b/src/runtime/CPUUtils.cpp
@@ -126,6 +126,9 @@
             case 0xd44: // X1
                 model = CPUModel::X1;
                 break;
+            case 0xd09: // A73
+                model = CPUModel::A73;
+                break;
             case 0xd0a: // A75
                 if(variant != 0)
                 {
diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h
index 2f7177a..372d4e7 100644
--- a/utils/TypePrinter.h
+++ b/utils/TypePrinter.h
@@ -2292,6 +2292,12 @@
         case CPUModel::A55r1:
             os << "A55r1";
             break;
+        case CPUModel::A73:
+            os << "A73";
+            break;
+        case CPUModel::X1:
+            os << "X1";
+            break;
         default:
             ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
     }