COMPMID-881: RSH new arm_gemm interface.

Change-Id: I1e2a1a77097d8017c274af3f97eba6964f80f5fa
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/122592
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/arm_compute/core/NEON/kernels/assembly/newgemm_lib.hpp b/arm_compute/core/NEON/kernels/assembly/newgemm_lib.hpp
new file mode 100644
index 0000000..b7cc3d7
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/newgemm_lib.hpp
@@ -0,0 +1,410 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <fstream>
+#include <iostream>
+#include <regex>
+#include <sstream>
+#include <thread>
+
+extern int l1_cache_size;
+extern int l2_cache_size;
+extern int force_cpu;
+
+#ifdef __ANDROID__
+/* Non-throwing stand-ins for std::stoul / std::stoi, reached through the unqualified calls below. */
+inline unsigned long stoul(const std::string &str, std::size_t *pos = nullptr, int base = 10) {
+    char *end = nullptr;
+    const unsigned long ret = strtoul(str.c_str(), &end, base);
+    if (pos != nullptr) { *pos = static_cast<std::size_t>(end - str.c_str()); } // pos is optional; callers pass nullptr
+    return ret;
+}
+inline int stoi(const std::string &str, std::size_t *pos = nullptr, int base = 10) {
+    char *end = nullptr;
+    const long ret = strtol(str.c_str(), &end, base); // strtol honours 'base'; atoi() parsed decimal only
+    if (pos != nullptr) { *pos = static_cast<std::size_t>(end - str.c_str()); }
+    return static_cast<int>(ret);
+}
+#endif
+#ifndef BARE_METAL
+#include <sys/auxv.h>
+
+/* Get HWCAP bits from asm/hwcap.h */
+#include <asm/hwcap.h>
+#endif /* !BARE_METAL */
+
+/* Make sure the bits we care about are defined, just in case asm/hwcap.h is
+ * out of date (or for bare metal mode) */
+#ifndef HWCAP_ASIMDHP
+#define HWCAP_ASIMDHP      (1 << 10)
+#endif
+
+#ifndef HWCAP_CPUID
+#define HWCAP_CPUID        (1 << 11)
+#endif
+
+#ifndef HWCAP_ASIMDDP
+#define HWCAP_ASIMDDP      (1 << 20)
+#endif
+
+#define CPUINFO_HACK
+
+//unsigned int get_cpu_impl();
+
+
+/* CPU models - we only need to detect CPUs we have
+ * microarchitecture-specific code for.
+ *
+ * Architecture features are detected via HWCAPs.
+ */
+enum class CPUModel {
+    GENERIC    = 0x0001, // Fallback for any CPU without a dedicated code path
+    A53        = 0x0010, // MIDR part number 0xd03
+    A55r0      = 0x0011, // MIDR part number 0xd05 with a zero variant field
+    A55r1      = 0x0012, // MIDR part number 0xd05 with a non-zero variant field
+};
+
+class CPUInfo
+{
+private:
+    struct PerCPUData {
+        CPUModel  model      = CPUModel::GENERIC; // Model assigned to this CPU; GENERIC until one is stored
+        uint32_t  midr       = 0;                 // MIDR value recorded for this CPU, 0 if none was read
+        bool      model_set  = false;             // True once a model has been stored (detected or set by hand)
+    };
+
+    std::vector<PerCPUData> _percpu={}; // Indexed by CPU ID; sized in the constructor
+
+    bool _cpuid   = false; // HWCAP_CPUID was reported in AT_HWCAP
+    bool _fp16    = false; // HWCAP_ASIMDHP was reported, or set_fp16() was called
+    bool _dotprod = false; // HWCAP_ASIMDDP was reported, whitelist matched, or set_dotprod() was called
+
+    unsigned int L1_cache_size = 32768;  // Default until set_L1_cache_size(); presumably bytes - confirm
+    unsigned int L2_cache_size = 262144; // Default until set_L2_cache_size(); presumably bytes - confirm
+
+    /* Translate a raw MIDR value into a CPUModel.
+     *
+     * midr: register value to decode.  Only the part number (bits [15:4])
+     *       and, for part 0xd05, the variant (bits [23:20]) are examined.
+     *
+     * Returns the model for parts that have a dedicated code path and
+     * CPUModel::GENERIC for every other part number.
+     */
+    CPUModel midr_to_model(const unsigned int midr) const {
+        // Pull out the two fields of interest.
+        const int part_number = (midr >> 4) & 0xFFF;
+        const int variant     = (midr >> 20) & 0xF;
+
+        // Part 0xd03 is reported as A53.
+        if (part_number == 0xd03) {
+            return CPUModel::A53;
+        }
+
+        // Part 0xd05 is reported as A55; a zero variant means r0, anything
+        // else is treated as r1.
+        if (part_number == 0xd05) {
+            if (variant != 0) {
+                return CPUModel::A55r1;
+            }
+
+            return CPUModel::A55r0;
+        }
+
+        // No microarchitecture-specific code exists for anything else, so it
+        // can safely be classed as GENERIC.
+        return CPUModel::GENERIC;
+    }
+
+    /* Fill the per-CPU table from the MIDR values the kernel publishes in
+     * /sys.  Only meaningful when the CPUID capability is present.  */
+    void populate_models_cpuid() {
+        for (unsigned long int cpu=0; cpu<_percpu.size(); cpu++) {
+            std::stringstream path;
+            path << "/sys/devices/system/cpu/cpu" << cpu << "/regs/identification/midr_el1";
+
+            std::ifstream midr_file(path.str(), std::ios::in);
+            std::string   text;
+
+            // CPUs whose file is missing or empty keep their default entry.
+            if (!midr_file.is_open() || !bool(getline(midr_file, text))) {
+                continue;
+            }
+
+            PerCPUData &entry = _percpu[cpu];
+
+            // The file holds the register value as hexadecimal text.
+            entry.midr      = (stoul(text, nullptr, 16) & 0xffffffff);
+            entry.model     = midr_to_model(entry.midr);
+            entry.model_set = true;
+        }
+    }
+
+    /* If "long-form" cpuinfo is present, parse that to populate models.
+     * The fields of each "processor" section are packed into an MIDR-shaped
+     * value (implementer [31:24], variant [23:20], architecture [19:16],
+     * part [15:4], revision [3:0]) and stored when the section ends.  */
+    void populate_models_cpuinfo() {
+        // \D* rather than .* so that every digit of a multi-digit number is captured.
+        std::regex   proc_regex("^processor\\D*(\\d+)$");
+        std::regex   imp_regex("^CPU implementer.*0x(..)$");
+        std::regex   var_regex("^CPU variant.*0x(.)$");
+        std::regex   part_regex("^CPU part.*0x(...)$");
+        std::regex   rev_regex("^CPU revision\\D*(\\d+)$");
+
+        std::ifstream file;
+        file.open("/proc/cpuinfo", std::ios::in);
+
+        if (!file.is_open()) {
+            return;
+        }
+
+        std::string  line;
+        unsigned int midr=0;
+        int          curcpu=-1;
+
+        // Store the MIDR gathered for 'curcpu'; IDs outside the table are skipped.
+        auto commit = [&]() {
+            if (curcpu < 0 || static_cast<std::size_t>(curcpu) >= _percpu.size()) {
+                return;
+            }
+
+            _percpu[curcpu].midr      = midr;
+            _percpu[curcpu].model     = midr_to_model(midr);
+            _percpu[curcpu].model_set = true;
+
+            printf("CPU %d: %x\n",curcpu,midr);
+        };
+
+        while(bool(getline(file, line))) {
+            std::smatch match;
+
+            if (std::regex_match(line, match, proc_regex)) {
+                if (curcpu >= 0 && midr==0) {
+                    // Matched a new CPU ID without any description of the previous one - looks like old format.
+                    return;
+                }
+
+                commit();
+
+                midr=0;
+                curcpu=stoi(match[1], nullptr, 0);
+                continue;
+            }
+
+            if (std::regex_match(line, match, imp_regex)) {
+                midr |= (static_cast<unsigned int>(stoi(match[1], nullptr, 16)) << 24);
+                continue;
+            }
+
+            if (std::regex_match(line, match, var_regex)) {
+                // Variant is MIDR bits [23:20], which is where midr_to_model() reads it.
+                midr |= (static_cast<unsigned int>(stoi(match[1], nullptr, 16)) << 20);
+                continue;
+            }
+
+            if (std::regex_match(line, match, part_regex)) {
+                midr |= (static_cast<unsigned int>(stoi(match[1], nullptr, 16)) << 4);
+                continue;
+            }
+
+            if (std::regex_match(line, match, rev_regex)) {
+                midr |= static_cast<unsigned int>(stoi(match[1], nullptr, 10));
+                midr |= (0xfu << 16);
+                continue;
+            }
+        }
+
+        commit();
+    }
+
+    /* Work out how many per-CPU slots are needed.
+     *
+     * Reads /sys/devices/system/cpu/present and returns the highest CPU ID
+     * listed there plus one.  If the file cannot be read, the value of
+     * std::thread::hardware_concurrency() is returned instead.
+     *
+     * BARE_METAL builds skip the lookup and always report a single CPU.
+     */
+    int get_max_cpus() {
+#ifdef BARE_METAL
+        return 1;
+#else  // !BARE_METAL
+        // Only the first line of the file is consulted.
+        std::ifstream present_file("/sys/devices/system/cpu/present", std::ios::in);
+        std::string   ranges;
+
+        const bool have_line = present_file.is_open() &&
+                               bool(getline(present_file, ranges));
+
+        if (!have_line) {
+            // Return std::thread::hardware_concurrency() as a fallback.
+            return std::thread::hardware_concurrency();
+        }
+
+        /* The file holds a list of ranges and/or single values, e.g. "0-5"
+         * or "1-3,5,7".  The highest valid ID is taken to be the integer
+         * that follows the last delimiter ('-' or ','), or the whole line
+         * when it contains no delimiter at all.
+         */
+        const std::string::size_type last_delim = ranges.find_last_of("-,");
+
+        if (last_delim != std::string::npos) {
+            // Drop everything up to and including that delimiter.
+            ranges.erase(0, last_delim + 1);
+        }
+
+        // The table is indexed by CPU ID, so it needs one more slot than the highest ID.
+        const int highest_id = stoi(ranges, nullptr, 0);
+
+        return highest_id + 1;
+#endif // !BARE_METAL
+    }
+
+public:
+    CPUInfo() { // Detect features from the HWCAP bits, then build the per-CPU model table.
+#ifndef BARE_METAL
+        unsigned long hwcaps = getauxval(AT_HWCAP); // capability bits from the auxiliary vector
+
+        if (hwcaps & HWCAP_CPUID) {
+            _cpuid = true;
+        }
+
+        if (hwcaps & HWCAP_ASIMDHP) {
+            _fp16 = true;
+        }
+
+        if (hwcaps & HWCAP_ASIMDDP) {
+            _dotprod = true;
+        }
+
+#ifdef __aarch64__
+        /* Pre-4.15 kernels don't have the ASIMDDP bit.
+         *
+         * Although the CPUID bit allows us to read the feature register
+         * directly, the kernel quite sensibly masks this to only show
+         * features known by it to be safe to show to userspace.  As a
+         * result, pre-4.15 kernels won't show the relevant bit in the
+         * feature registers either.
+         *
+         * So for now, use a whitelist of CPUs known to support the feature.
+         */
+        if (!_dotprod && _cpuid) {
+            /* List of CPUs with dot product support:         A55r1       A75r1       A75r2  */
+            const unsigned int dotprod_whitelist_masks[]  = { 0xfff0fff0, 0xfff0fff0, 0xfff0fff0, 0 }; // masks out MIDR bits [19:16] and [3:0]
+            const unsigned int dotprod_whitelist_values[] = { 0x4110d050, 0x4110d0a0, 0x4120d0a0, 0 }; // trailing 0 terminates the list
+
+            unsigned long cpuid; // MIDR of whichever CPU executes the mrs below
+
+            __asm __volatile (
+                "mrs %0, midr_el1\n"
+                : "=r" (cpuid)
+                :
+                :
+            );
+
+            for (int i=0;dotprod_whitelist_values[i];i++) { // stops at the zero sentinel
+                if ((cpuid & dotprod_whitelist_masks[i]) == dotprod_whitelist_values[i]) {
+                    _dotprod = true;
+                    break;
+                }
+            }
+        }
+#endif
+        _percpu.resize(get_max_cpus()); // one PerCPUData slot per CPU ID
+#endif
+        if (_cpuid) { // NOTE(review): this part also runs under BARE_METAL, where _percpu was never resized
+            populate_models_cpuid();
+        } else {
+            populate_models_cpuinfo();
+        }
+    }
+
+    void set_fp16(const bool fp16) { // Override the FP16 flag found at construction.
+        _fp16 = fp16;
+    }
+
+    void set_dotprod(const bool dotprod) { // Override the dot-product flag found at construction.
+        _dotprod = dotprod;
+    }
+
+    void set_cpu_model(unsigned long cpuid, CPUModel model) { // Force the model for one CPU ID; IDs outside the table are ignored.
+        if (_percpu.size() > cpuid) {
+            _percpu[cpuid].model     = model;
+            _percpu[cpuid].model_set = true;
+        }
+    }
+
+    bool has_fp16() const { // Current FP16 flag.
+        return _fp16;
+    }
+
+    bool has_dotprod() const { // Current dot-product flag.
+        return _dotprod;
+    }
+
+    CPUModel get_cpu_model(unsigned long cpuid) const { // Model stored for the given CPU ID.
+        if (cpuid < _percpu.size()) {
+            return _percpu[cpuid].model;
+        }
+
+        return CPUModel::GENERIC; // IDs outside the table are reported as GENERIC
+    }
+
+    CPUModel get_cpu_model() const { // Model of the CPU the caller is currently running on (CPU 0 on BARE_METAL).
+#ifdef BARE_METAL
+        return get_cpu_model(0);
+#else
+        return get_cpu_model(sched_getcpu()); // NOTE(review): a negative int result converts to a huge index, i.e. GENERIC
+#endif
+    }
+
+    unsigned int get_L1_cache_size() const { // Stored L1 size (32768 unless changed).
+        return L1_cache_size;
+    }
+
+    void set_L1_cache_size(unsigned int size) { // Replace the stored L1 size.
+        L1_cache_size = size;
+    }
+
+    unsigned int get_L2_cache_size() const { // Stored L2 size (262144 unless changed).
+        return L2_cache_size;
+    }
+
+    void set_L2_cache_size(unsigned int size) { // Replace the stored L2 size.
+        L2_cache_size = size;
+    }
+};
+
+CPUInfo *get_CPUInfo();