COMPMID-3259: Fix scalar register allocation

The AArch64 ABI reserves X18 as the platform register. Replace all
references to X18 with X9, a temporary register that has no special
purpose.

Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: Ia9e059d44c5edda216bea169d0418bb7a8c4311b
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/2863
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Nikhil Raj Arm <nikhil.raj@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp
index 908fc82..e4aad76 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp
@@ -44,9 +44,9 @@
   __asm__ __volatile__(
     "ldr q0, [%[pcoeffs]]\n"
     "add x25, %[inptr0], %[input_row_stride]\n"
-    "add x18, %[input_col_stride1], %[input_col_stride1]\n"
+    "add x9, %[input_col_stride1], %[input_col_stride1]\n"
     "add x16, x25, %[input_row_stride]\n"
-    "add x19, x18, %[input_col_stride1]\n"
+    "add x19, x9, %[input_col_stride1]\n"
     "add x26, x16, %[input_row_stride]\n"
     "add x20, x19, %[input_col_stride1]\n"
     "add x17, x26, %[input_row_stride]\n"
@@ -65,7 +65,7 @@
     "blt 2f\n"
     "1:\n"
     "ldr q8, [%[inptr0], x20]\n"
-    "ldr q2, [%[inptr0], x18]\n"
+    "ldr q2, [%[inptr0], x9]\n"
     "mov v14.16b, v8.16b\n"
     "ldr q9, [%[inptr0]]\n"
     "mov v10.16b, v8.16b\n"
@@ -77,7 +77,7 @@
     "fmls v10.4s, v12.4s, v0.s[2]\n"
     "ldr q5, [x16, x20]\n"
     "fmls v14.4s, v2.4s, v0.s[3]\n"
-    "ldr q20, [x16, x18]\n"
+    "ldr q20, [x16, x9]\n"
     "fmla v9.4s, v12.4s, v0.s[2]\n"
     "ldr q3, [x16]\n"
     "fmls v10.4s, v2.4s, v0.s[2]\n"
@@ -89,7 +89,7 @@
     "fadd v10.4s, v10.4s, v4.4s\n"
     "ldr q17, [x17, x20]\n"
     "fmls v7.4s, v12.4s, v0.s[1]\n"
-    "ldr q15, [x17, x18]\n"
+    "ldr q15, [x17, x9]\n"
     "fsub v9.4s, v9.4s, v4.4s\n"
     "ldr q19, [x17]\n"
     "mov v8.16b, v8.16b\n"
@@ -180,7 +180,7 @@
     "mov v25.16b, v19.16b\n"
     "ldr q11, [x25, x20]\n"
     "mov v10.16b, v11.16b\n"
-    "ldr q23, [x25, x18]\n"
+    "ldr q23, [x25, x9]\n"
     "mov v9.16b, v11.16b\n"
     "ldr q7, [x25]\n"
     "fmla v10.4s, v7.4s, v0.s[2]\n"
@@ -192,7 +192,7 @@
     "fmls v10.4s, v23.4s, v0.s[3]\n"
     "ldr q30, [x26, x20]\n"
     "fmls v9.4s, v21.4s, v0.s[2]\n"
-    "ldr q29, [x26, x18]\n"
+    "ldr q29, [x26, x9]\n"
     "fmla v7.4s, v21.4s, v0.s[2]\n"
     "ldr q22, [x26]\n"
     "fmls v8.4s, v21.4s, v0.s[1]\n"
@@ -360,7 +360,7 @@
     "add x14, x14, #16\n"
     "ldr q2, [x27, x20]\n"
     "mov v4.16b, v2.16b\n"
-    "ldr q17, [x27, x18]\n"
+    "ldr q17, [x27, x9]\n"
     "mov v12.16b, v2.16b\n"
     "ldr q18, [x27]\n"
     "fmla v4.4s, v18.4s, v0.s[2]\n"
@@ -420,7 +420,7 @@
     "blt 3f\n"
     "ldr d8, [%[inptr0], x20]\n"
     "mov v14.16b, v8.16b\n"
-    "ldr d2, [%[inptr0], x18]\n"
+    "ldr d2, [%[inptr0], x9]\n"
     "mov v10.16b, v8.16b\n"
     "ldr d9, [%[inptr0]]\n"
     "fmla v14.4s, v9.4s, v0.s[2]\n"
@@ -432,7 +432,7 @@
     "fmls v14.4s, v2.4s, v0.s[3]\n"
     "ldr d5, [x16, x20]\n"
     "fmls v10.4s, v12.4s, v0.s[2]\n"
-    "ldr d20, [x16, x18]\n"
+    "ldr d20, [x16, x9]\n"
     "fmla v9.4s, v12.4s, v0.s[2]\n"
     "ldr d3, [x16]\n"
     "fmls v7.4s, v12.4s, v0.s[1]\n"
@@ -444,7 +444,7 @@
     "fsub v7.4s, v7.4s, v2.4s\n"
     "ldr d17, [x17, x20]\n"
     "fadd v10.4s, v10.4s, v4.4s\n"
-    "ldr d15, [x17, x18]\n"
+    "ldr d15, [x17, x9]\n"
     "fsub v9.4s, v9.4s, v4.4s\n"
     "ldr d19, [x17]\n"
     "fmla v7.4s, v4.4s, v0.s[1]\n"
@@ -534,7 +534,7 @@
     "mov v25.16b, v19.16b\n"
     "ldr d11, [x25, x20]\n"
     "mov v10.16b, v11.16b\n"
-    "ldr d23, [x25, x18]\n"
+    "ldr d23, [x25, x9]\n"
     "mov v9.16b, v11.16b\n"
     "ldr d7, [x25]\n"
     "fmla v10.4s, v7.4s, v0.s[2]\n"
@@ -546,7 +546,7 @@
     "fmls v10.4s, v23.4s, v0.s[3]\n"
     "ldr d30, [x26, x20]\n"
     "fmls v9.4s, v21.4s, v0.s[2]\n"
-    "ldr d29, [x26, x18]\n"
+    "ldr d29, [x26, x9]\n"
     "fmla v7.4s, v21.4s, v0.s[2]\n"
     "ldr d22, [x26]\n"
     "fmls v8.4s, v21.4s, v0.s[1]\n"
@@ -714,7 +714,7 @@
     "add x14, x14, #8\n"
     "ldr d2, [x27, x20]\n"
     "mov v4.16b, v2.16b\n"
-    "ldr d17, [x27, x18]\n"
+    "ldr d17, [x27, x9]\n"
     "mov v12.16b, v2.16b\n"
     "ldr d18, [x27]\n"
     "fmla v4.4s, v18.4s, v0.s[2]\n"
@@ -771,7 +771,7 @@
     "cbz %w[n_channels], 4f\n"
     "ldr s8, [%[inptr0], x20]\n"
     "mov v14.16b, v8.16b\n"
-    "ldr s2, [%[inptr0], x18]\n"
+    "ldr s2, [%[inptr0], x9]\n"
     "mov v10.16b, v8.16b\n"
     "ldr s9, [%[inptr0]]\n"
     "fmla v14.4s, v9.4s, v0.s[2]\n"
@@ -783,7 +783,7 @@
     "fmls v14.4s, v2.4s, v0.s[3]\n"
     "ldr s5, [x16, x20]\n"
     "fmls v10.4s, v12.4s, v0.s[2]\n"
-    "ldr s20, [x16, x18]\n"
+    "ldr s20, [x16, x9]\n"
     "fmla v9.4s, v12.4s, v0.s[2]\n"
     "ldr s3, [x16]\n"
     "fmls v7.4s, v12.4s, v0.s[1]\n"
@@ -795,7 +795,7 @@
     "fsub v7.4s, v7.4s, v2.4s\n"
     "ldr s17, [x17, x20]\n"
     "fadd v10.4s, v10.4s, v4.4s\n"
-    "ldr s15, [x17, x18]\n"
+    "ldr s15, [x17, x9]\n"
     "fsub v9.4s, v9.4s, v4.4s\n"
     "ldr s19, [x17]\n"
     "fmla v7.4s, v4.4s, v0.s[1]\n"
@@ -885,7 +885,7 @@
     "mov v25.16b, v19.16b\n"
     "ldr s11, [x25, x20]\n"
     "mov v10.16b, v11.16b\n"
-    "ldr s23, [x25, x18]\n"
+    "ldr s23, [x25, x9]\n"
     "mov v9.16b, v11.16b\n"
     "ldr s7, [x25]\n"
     "fmla v10.4s, v7.4s, v0.s[2]\n"
@@ -897,7 +897,7 @@
     "fmls v10.4s, v23.4s, v0.s[3]\n"
     "ldr s30, [x26, x20]\n"
     "fmls v9.4s, v21.4s, v0.s[2]\n"
-    "ldr s29, [x26, x18]\n"
+    "ldr s29, [x26, x9]\n"
     "fmla v7.4s, v21.4s, v0.s[2]\n"
     "ldr s22, [x26]\n"
     "fmls v8.4s, v21.4s, v0.s[1]\n"
@@ -1065,7 +1065,7 @@
     "add x14, x14, #4\n"
     "ldr s2, [x27, x20]\n"
     "mov v4.16b, v2.16b\n"
-    "ldr s17, [x27, x18]\n"
+    "ldr s17, [x27, x9]\n"
     "mov v12.16b, v2.16b\n"
     "ldr s18, [x27]\n"
     "fmla v4.4s, v18.4s, v0.s[2]\n"
@@ -1129,7 +1129,7 @@
     : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
       "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
       "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8",
-      "v9", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19",
+      "v9", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x9", "x19",
       "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
   );
 }