COMPMID-481: Add gemmlowp_aarch64_v8p4 kernel.

Change-Id: I15496b16ffd636f5bff76572e750df7e15c80830
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/90532
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
diff --git a/tests/validation/CPP/GEMMInterleaveBlocked.h b/tests/validation/CPP/GEMMInterleaveBlocked.h
new file mode 100644
index 0000000..ff5a0d6
--- /dev/null
+++ b/tests/validation/CPP/GEMMInterleaveBlocked.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GEMM.h"
+
+#include "arm_compute/core/Types.h"
+#include "tests/validation/FixedPoint.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+/** Fetch t(y, x), or 0 when y or x reaches or exceeds the tensor's y/x extent (negative indices are not guarded). */
+template <typename T>
+T safe_read(const SimpleTensor<T> &t, int y, int x)
+{
+    const int width  = t.shape().x(); // x extent, doubles as the row stride
+    const int height = t.shape().y();
+    if((y >= height) || (x >= width))
+    {
+        return 0; // Reads past the upper bounds behave as zero padding
+    }
+    return t[y * width + x];
+}
+
+/** Reference blocked interleave: fills each row of out with `block`-wide runs taken from `int_by` consecutive
+ *  rows of in (or columns, when transposed); reads outside in yield 0 via safe_read(). Returns a copy of out. */
+template <typename T>
+SimpleTensor<T> gemm_interleave_blocked(const SimpleTensor<T> &in, SimpleTensor<T> &out, int int_by, int block, bool transposed)
+{
+    const int out_h = out.shape().y();
+    const int out_w = out.shape().x();
+    for(int row = 0; row < out_h; ++row)
+    {
+        int written = 0; // Elements stored so far in the current output row
+        for(int start = 0; start < (out_w / int_by); start += block)
+        {
+            for(int lane = 0; lane < int_by; ++lane)
+            {
+                const int src = (row * int_by) + lane;
+                // Stop as soon as the output row is full, even in the middle of a block
+                for(int k = 0; (written < out_w) && (k < block); ++k, ++written)
+                {
+                    const int pos = start + k;
+                    out[row * out_w + written] = transposed ? safe_read(in, pos, src) : safe_read(in, src, pos);
+                }
+            }
+        }
+    }
+    return out;
+}
+
+// Explicit instantiation for uint8_t elements
+template SimpleTensor<uint8_t> gemm_interleave_blocked(const SimpleTensor<uint8_t> &in, SimpleTensor<uint8_t> &out, int int_by, int block, bool transposed);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CPP/GEMMLowp.cpp b/tests/validation/CPP/GEMMLowp.cpp
index d172a77..06926e6 100644
--- a/tests/validation/CPP/GEMMLowp.cpp
+++ b/tests/validation/CPP/GEMMLowp.cpp
@@ -34,6 +34,42 @@
 {
 namespace reference
 {
+/** Reference u8 x u8 -> u32 matrix multiply: c(i, j) = sum over k of a(i, k) * b(k, j), with no offsets or scaling.
+ *
+ * @param[in]  a LHS matrix; its x extent is the accumulation depth K.
+ * @param[in]  b RHS matrix, read row by row with its own x extent as stride.
+ * @param[out] c Destination; its shape gives the number of rows and columns computed.
+ * @return A copy of @p c after it has been filled.
+ */
+SimpleTensor<uint32_t> gemmlowp(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b, SimpleTensor<uint32_t> &c)
+{
+    const int K       = a.shape().x();
+    const int b_width = b.shape().x();
+    const int rows    = c.shape().y(); //M
+    const int cols    = c.shape().x(); //N
+    // Never index past the accumulator row (cols) nor past a row of b (b_width)
+    const int width = (b_width < cols) ? b_width : cols;
+    // Accumulate in uint32_t (the output type) so that overflow wraps instead of being undefined
+    std::vector<uint32_t> acc(cols);
+    for(int i = 0; i < rows; ++i)
+    {
+        acc.assign(acc.size(), 0U); // Restart the sums for output row i
+        for(int k = 0; k < K; ++k)
+        {
+            const auto tmp_a = static_cast<uint32_t>(a[k + i * K]);
+            for(int j = 0; j < width; ++j)
+            {
+                acc[j] += tmp_a * static_cast<uint32_t>(b[j + k * b_width]);
+            }
+        }
+        for(int j = 0; j < cols; ++j)
+        {
+            c[j + i * cols] = acc[j];
+        }
+    }
+    return c;
+}
+
 template <typename T>
 SimpleTensor<T> gemmlowp(const SimpleTensor<T> &a, const SimpleTensor<T> &b, SimpleTensor<T> &c,
                          int32_t a_offset, int32_t b_offset, int32_t c_offset, int32_t c_mult_int, int32_t out_shift)
diff --git a/tests/validation/CPP/GEMMLowp.h b/tests/validation/CPP/GEMMLowp.h
index 2160975..0428e9e 100644
--- a/tests/validation/CPP/GEMMLowp.h
+++ b/tests/validation/CPP/GEMMLowp.h
@@ -35,6 +35,8 @@
 {
 namespace reference
 {
+SimpleTensor<uint32_t> gemmlowp(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b, SimpleTensor<uint32_t> &c); // Overload without offsets/multiplier/shift: uint8 inputs a and b, uint32 result written to c
+
 template <typename T>
 SimpleTensor<T> gemmlowp(const SimpleTensor<T> &a, const SimpleTensor<T> &b, SimpleTensor<T> &c,
                          int32_t a_offset, int32_t b_offset, int32_t c_offset, int32_t c_mult_int, int32_t out_shift);