COMPMID-1369: Revert accidental formatting of RSH's repo

Pulled latest fixes from David's repo:

commit f43ebe932c84083332b0b1a0348241b69dda63a7
Author: David Mansell <David.Mansell@arm.com>
Date:   Tue Jul 3 18:09:01 2018 +0100

    Whitespace tidying, fixed comment in gemv_batched imported from ACL.

Change-Id: Ie37a623f44e90d88072236cb853ac55ac82d5f51
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/138530
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-by: David Mansell <david.mansell@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
index 5434599..b6565ba 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
@@ -31,91 +31,100 @@
 template <>
 template <typename T>
 inline void TransformImpl<12, 1, true, 4, 4>::Transform(
-    T *out, const T *const in, const int stride,
-    const int x0, const int xmax, const int k0, const int kmax)
-{
-    // Redirect to a 24 x uint16_t specialisation
-    TransformImpl<24, 1, true, 2, 2>::Transform(
-        reinterpret_cast<uint16_t *>(out),
-        reinterpret_cast<const uint16_t *const>(in),
-        stride * 2, x0 * 2, xmax * 2, k0, kmax);
+    T* out, const T* const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax
+) {
+  // Redirect to a 24 x uint16_t specialisation
+  TransformImpl<24, 1, true, 2, 2>::Transform(
+    reinterpret_cast<uint16_t *>(out),
+    reinterpret_cast<const uint16_t * const>(in),
+    stride*2, x0*2, xmax*2, k0, kmax
+  );
 }
 
 // Generic 24x16-bit sized specialisation
 template <>
 template <typename T>
 inline void TransformImpl<24, 1, true, 2, 2>::Transform(
-    T *out, const T *const in, const int stride,
-    const int x0, const int xmax, const int k0, const int kmax)
-{
-    // Redirect to a uint16_t specialisation
-    Transform(
-        reinterpret_cast<uint16_t *>(out),
-        reinterpret_cast<const uint16_t *const>(in),
-        stride, x0, xmax, k0, kmax);
+    T* out, const T* const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax
+) {
+  // Redirect to a uint16_t specialisation
+  Transform(
+    reinterpret_cast<uint16_t *>(out),
+    reinterpret_cast<const uint16_t * const>(in),
+    stride, x0, xmax, k0, kmax
+  );
 }
 
 // Specialised 24 x uint16_t version
 template <>
-inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out)
-{
-    __asm __volatile(
-        "LDP    q0, q1, [%[in0]], #32\n"
-        "STP    q0, q1, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
-        "LDR    q2, [%[in0]], #16\n"
-        "STR    q2, [%[out], #32]\n"
-        : [in0] "+r"(in0), [out] "+r"(out)
-        :
-        : "v0", "v1", "v2", "memory");
-}
-
-template <>
-inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out)
-{
-    __asm __volatile(
-        "LDP    q0, q1, [%[in0]], #32\n"
-        "STP    q0, q1, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
-        "LDR    q2, [%[in0]], #16\n"
-        "LDP    q3, q4, [%[in1]], #32\n"
-        "STP    q2, q3, [%[out], #32]\n" ASM_PREFETCH("[%[in1], #192]")
-        "LDR    q5, [%[in1]], #16\n"
-        "STP    q4, q5, [%[out], #64]\n"
-        : [in0] "+r"(in0), [in1] "+r"(in1), [out] "+r"(out)
-        :
-        : "v0", "v1", "v2", "v3", "v4", "v5", "memory");
-}
-
-template <>
-inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out)
-{
-    __asm __volatile(
+inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) {
+    __asm __volatile (
         "LDP    q0, q1, [%[in0]], #32\n"
         "STP    q0, q1, [%[out]]\n"
-        "LDR    q2, [%[in0]], #16\n" ASM_PREFETCH("[%[in0], #192]")
-        "LDP    q3, q4, [%[in1]], #32\n"
-        "STP    q2, q3, [%[out], #32]\n"
-        "LDR    q5, [%[in1]], #16\n" ASM_PREFETCH("[%[in1], #192]")
-        "STP    q4, q5, [%[out], #64]\n"
-        "LDP    q6, q7, [%[in2]], #32\n"
-        "STP    q6, q7, [%[out], #96]\n"
-        "LDR    q8, [%[in2]], #16\n" ASM_PREFETCH("[%[in2], #192]")
-        "LDP    q9, q10, [%[in3]], #32\n"
-        "STP    q8, q9, [%[out], #128]\n"
-        "LDR    q11, [%[in3]], #16\n"
-        "STP    q10, q11, [%[out], #160]\n" ASM_PREFETCH("[%[in3], #192]")
+        ASM_PREFETCH("[%[in0], #192]")
+        "LDR    q2, [%[in0]], #16\n"
+        "STR    q2, [%[out], #32]\n"
+    : [in0] "+r" (in0), [out] "+r" (out)
+    :
+    : "v0", "v1", "v2", "memory"
+    );
+}
 
-        : [in0] "+r"(in0), [in1] "+r"(in1), [in2] "+r"(in2), [in3] "+r"(in3), [out] "+r"(out)
-        :
-        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory");
+template <>
+inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1,uint16_t *out) {
+    __asm __volatile (
+        "LDP    q0, q1, [%[in0]], #32\n"
+        "STP    q0, q1, [%[out]]\n"
+        ASM_PREFETCH("[%[in0], #192]")
+        "LDR    q2, [%[in0]], #16\n"
+        "LDP	q3, q4, [%[in1]], #32\n"
+        "STP    q2, q3, [%[out], #32]\n"
+        ASM_PREFETCH("[%[in1], #192]")
+        "LDR	q5, [%[in1]], #16\n"
+        "STP    q4, q5, [%[out], #64]\n"
+    : [in0] "+r" (in0), [in1] "+r" (in1), [out] "+r" (out)
+    :
+    : "v0", "v1", "v2", "v3", "v4", "v5", "memory"
+    );
+}
+
+template <>
+inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) {
+    __asm __volatile (
+        "LDP    q0, q1, [%[in0]], #32\n"
+        "STP    q0, q1, [%[out]]\n"
+        "LDR    q2, [%[in0]], #16\n"
+        ASM_PREFETCH("[%[in0], #192]")
+        "LDP	q3, q4, [%[in1]], #32\n"
+        "STP    q2, q3, [%[out], #32]\n"
+        "LDR	q5, [%[in1]], #16\n"
+        ASM_PREFETCH("[%[in1], #192]")
+        "STP    q4, q5, [%[out], #64]\n"
+        "LDP	q6, q7, [%[in2]], #32\n"
+        "STP    q6, q7, [%[out], #96]\n"
+        "LDR	q8, [%[in2]], #16\n"
+        ASM_PREFETCH("[%[in2], #192]")
+        "LDP	q9, q10, [%[in3]], #32\n"
+        "STP    q8, q9, [%[out], #128]\n"
+        "LDR	q11, [%[in3]], #16\n"
+        "STP    q10, q11, [%[out], #160]\n"
+        ASM_PREFETCH("[%[in3], #192]")
+
+    : [in0] "+r" (in0), [in1] "+r" (in1), [in2] "+r" (in2), [in3] "+r" (in3), [out] "+r" (out)
+    :
+    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory"
+    );
 }
 
 template <>
 template <>
 inline void TransformImpl<24, 1, true, 2, 2>::Transform(
-    uint16_t *out, const uint16_t *const in, const int stride,
-    const int x0, const int xmax, const int k0, const int kmax)
-{
-    TransposeInterleaveCommon<24, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
+    uint16_t* out, const uint16_t* const in, const int stride,
+    const int x0, const int xmax, const int k0, const int kmax
+) {
+  TransposeInterleaveCommon<24, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
 }
 
-#endif // __arch64__
+#endif  // __aarch64__