MLECO-1860: Support for Arm GNU Embedded Toolchain

This patch enables compilation of ML use cases bare-metal applications
using Arm GNU Embedded Toolchain. The GNU toolchain can be used instead
of the Arm Compiler that was already supported.

The GNU toolchain is also set as the default toolchain when building
applications for the MPS3 target.

Note: The version of GNU toolchain must be 10.2.1 or higher.

Change-Id: I5fff242f0f52d2db6c75d292f9fa990df1aec978
Signed-off-by: Kshitij Sisodia <kshitij.sisodia@arm.com>
diff --git a/source/application/hal/platforms/bare-metal/bsp/mem_layout/mps3-sse-300.ld b/source/application/hal/platforms/bare-metal/bsp/mem_layout/mps3-sse-300.ld
new file mode 100644
index 0000000..8bb99cd
--- /dev/null
+++ b/source/application/hal/platforms/bare-metal/bsp/mem_layout/mps3-sse-300.ld
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+__STACK_SIZE = 0x00060000;
+__HEAP_SIZE  = 0x000f0000;
+
+/* System memory brief */
+MEMORY
+{
+  ITCM  (rx)  : ORIGIN = 0x00000000, LENGTH = 0x00080000
+  DTCM  (rwx) : ORIGIN = 0x20000000, LENGTH = 0x00080000
+  BRAM  (rwx) : ORIGIN = 0x11000000, LENGTH = 0x00200000
+  SRAM  (rwx) : ORIGIN = 0x31000000, LENGTH = 0x00400000
+  DDR   (rwx) : ORIGIN = 0x70000000, LENGTH = 0x02000000
+}
+
+/* Linker script to place sections and symbol values. Should be used together
+ * with other linker script that defines memory regions ITCM and RAM.
+ * It references following symbols, which must be defined in code:
+ *   Reset_Handler : Entry of reset handler
+ *
+ * It defines following symbols, which code can use without definition:
+ *   __exidx_start
+ *   __exidx_end
+ *   __copy_table_start__
+ *   __copy_table_end__
+ *   __zero_table_start__
+ *   __zero_table_end__
+ *   __etext
+ *   __data_start__
+ *   __preinit_array_start
+ *   __preinit_array_end
+ *   __init_array_start
+ *   __init_array_end
+ *   __fini_array_start
+ *   __fini_array_end
+ *   __data_end__
+ *   __bss_start__
+ *   __bss_end__
+ *   __end__
+ *   end
+ *   __HeapLimit
+ *   __StackLimit
+ *   __StackTop
+ *   __stack
+ */
+ENTRY(Reset_Handler)
+
+SECTIONS
+{
+  .text.at_itcm :
+  {
+    KEEP(*(.vectors))
+    *(.text*)
+
+    KEEP(*(.init))
+    KEEP(*(.fini))
+
+    /* .ctors */
+    *crtbegin.o(.ctors)
+    *crtbegin?.o(.ctors)
+    *(EXCLUDE_FILE(*crtend?.o *crtend.o) .ctors)
+    *(SORT(.ctors.*))
+    *(.ctors)
+
+    /* .dtors */
+    *crtbegin.o(.dtors)
+    *crtbegin?.o(.dtors)
+    *(EXCLUDE_FILE(*crtend?.o *crtend.o) .dtors)
+    *(SORT(.dtors.*))
+    *(.dtors)
+
+    KEEP(*(.eh_frame*))
+  } > ITCM
+
+  .ARM.extab.at_itcm :
+  {
+    *(.ARM.extab* .gnu.linkonce.armextab.*)
+  } > ITCM
+
+  __exidx_start = .;
+  .ARM.exidx.at_itcm :
+  {
+    *(.ARM.exidx* .gnu.linkonce.armexidx.*)
+  } > ITCM
+  __exidx_end = .;
+
+  .zero.table.at_itcm :
+  {
+    . = ALIGN(4);
+    __zero_table_start__ = .;
+
+    LONG (__bss_start__)
+    LONG ((__bss_end__ - __bss_start__)/4) /* Size is in 32-bit words */
+
+    __zero_table_end__ = .;
+  } > ITCM
+
+  .copy.table.at_itcm :
+  {
+    . = ALIGN(4);
+    __copy_table_start__ = .;
+
+    /* Section to be copied - part 1: any data to be placed in BRAM */
+    LONG (__etext)
+    LONG (__data_start__)
+    LONG ((__data_end__ - __data_start__)/4) /* Size is in 32-bit words */
+
+    /* Section to be copied - part 2: RO data for for DTCM */
+    LONG (__etext2)
+    LONG (__ro_data_start__)
+    LONG ((__ro_data_end__ - __ro_data_start__)/4) /* Size is in 32-bit words */
+
+    __copy_table_end__ = .;
+  } > ITCM
+
+  __itcm_total = ALIGN(4);
+
+  ASSERT( __itcm_total < (ORIGIN(ITCM) + LENGTH(ITCM)), "ITCM overflow")
+
+  .sram :
+  {
+    . = ALIGN(16);
+    *(.bss.NoInit.activation_buf)
+    . = ALIGN(16);
+  } > SRAM AT > SRAM
+
+  .bss :
+  {
+    . = ALIGN(4);
+    __bss_start__ = .;
+    *(.bss)
+    *(.bss.*)
+    *(COMMON)
+    . = ALIGN(4);
+    __bss_end__ = .;
+  } > DTCM AT > DTCM
+
+  .stack (ORIGIN(DTCM) + LENGTH(DTCM) - __STACK_SIZE) (COPY) :
+  {
+    . = ALIGN(8);
+    __StackLimit = .;
+    . = . + __STACK_SIZE;
+    . = ALIGN(8);
+    __StackTop = .;
+  } > DTCM
+  PROVIDE(__stack = __StackTop);
+  ASSERT(
+    (__STACK_SIZE + __bss_end__ - __bss_start__) <= LENGTH(DTCM),
+    "DTCM overflow")
+
+  .ddr.at_ddr :
+  {
+    /* __attribute__((aligned(16))) is not handled by the CMSIS startup code.
+     * Force the alignment here as a workaround */
+    . = ALIGN(16);
+    *(ifm)
+    . = ALIGN(16);
+    *(nn_model)
+    . = ALIGN (16);
+    *(labels)
+    . = ALIGN (16);
+    *(activation_buf)
+    . = ALIGN (16);
+  } > DDR AT > DDR
+
+  /**
+   * Location counter can end up 2byte aligned with narrow Thumb code but
+   * __etext is assumed by startup code to be the LMA of a section in DTCM
+   * which must be 4byte aligned
+   */
+  __etext = ALIGN (4);
+
+  .bram.at_ddr :  AT (__etext)
+  {
+    __data_start__ = .;
+    *(vtable)
+    *(.data)
+    *(.data.*)
+    . = ALIGN(4);
+    PROVIDE_HIDDEN (__preinit_array_start = .);
+    KEEP(*(.preinit_array))
+    PROVIDE_HIDDEN (__preinit_array_end = .);
+    . = ALIGN(4);
+    PROVIDE_HIDDEN (__init_array_start = .);
+    KEEP(*(SORT(.init_array.*)))
+    KEEP(*(.init_array))
+    PROVIDE_HIDDEN (__init_array_end = .);
+    . = ALIGN(4);
+    PROVIDE_HIDDEN (__fini_array_start = .);
+    KEEP(*(SORT(.fini_array.*)))
+    KEEP(*(.fini_array))
+    PROVIDE_HIDDEN (__fini_array_end = .);
+    KEEP(*(.jcr*))
+    . = ALIGN(4);
+
+    __data_end__ = .;
+  } > BRAM
+
+  __etext2 = __etext + (__data_end__ - __data_start__);
+
+  .data.at_ddr : AT (__etext2)
+  {
+    . = ALIGN(4);
+    __ro_data_start__ = .;
+
+    *(.rodata*)
+    . = ALIGN(4);
+    * (npu_driver_version)
+    . = ALIGN(4);
+    * (npu_driver_arch_version)
+    . = ALIGN(4);
+
+    __ro_data_end__ = .;
+  } > BRAM
+
+  .heap (COPY) :
+  {
+    . = ALIGN(8);
+    __end__ = .;
+    PROVIDE(end = .);
+    . = . + __HEAP_SIZE;
+    . = ALIGN(8);
+    __HeapLimit = .;
+  } > BRAM
+
+  ASSERT (
+      (__ro_data_end__ - __ro_data_start__)
+    + (__data_end__  - __data_start__)
+    + __HEAP_SIZE <= LENGTH(BRAM),
+    "BRAM overflow")
+}
diff --git a/source/application/hal/platforms/bare-metal/bsp/mem_layout/mps3-sse-300.sct b/source/application/hal/platforms/bare-metal/bsp/mem_layout/mps3-sse-300.sct
index 327d511..55ed5d7 100644
--- a/source/application/hal/platforms/bare-metal/bsp/mem_layout/mps3-sse-300.sct
+++ b/source/application/hal/platforms/bare-metal/bsp/mem_layout/mps3-sse-300.sct
@@ -78,12 +78,12 @@
 LOAD_REGION_1       0x70000000                  0x02000000
 {
     ;-----------------------------------------------------
-    ; 32 MiB of DRAM space for neural network model,
+    ; 32 MiB of DDR space for neural network model,
     ; input vectors and labels. If the activation buffer
     ; size required by the network is bigger than the
     ; SRAM size available, it is accommodated here.
     ;-----------------------------------------------------
-    dram.bin        0x70000000 ALIGN 16         0x02000000
+    ddr.bin        0x70000000 ALIGN 16         0x02000000
     {
         ; nn model's baked in input matrices
         *.o (ifm)
@@ -110,9 +110,9 @@
     }
 
     ;-----------------------------------------------------
-    ; Remaining part of the 2MiB BRAM used as heap space.
-    ; 0x00200000 - 0x00040000 = 0x001C0000 (1.75 MiB)
+    ; 960 KiB of remaining part of the 2MiB BRAM used as
+    ; heap space. 0x000F0000 of 0x0x001C0000 available.
     ;-----------------------------------------------------
-    ARM_LIB_HEAP    0x11040000 EMPTY ALIGN 8    0x001C0000
+    ARM_LIB_HEAP    0x11040000 EMPTY ALIGN 8    0x000F0000
     {}
 }
diff --git a/source/application/hal/platforms/bare-metal/bsp/mem_layout/simple_platform.ld b/source/application/hal/platforms/bare-metal/bsp/mem_layout/simple_platform.ld
new file mode 100644
index 0000000..8bb99cd
--- /dev/null
+++ b/source/application/hal/platforms/bare-metal/bsp/mem_layout/simple_platform.ld
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+__STACK_SIZE = 0x00060000;
+__HEAP_SIZE  = 0x000f0000;
+
+/* System memory brief */
+MEMORY
+{
+  ITCM  (rx)  : ORIGIN = 0x00000000, LENGTH = 0x00080000
+  DTCM  (rwx) : ORIGIN = 0x20000000, LENGTH = 0x00080000
+  BRAM  (rwx) : ORIGIN = 0x11000000, LENGTH = 0x00200000
+  SRAM  (rwx) : ORIGIN = 0x31000000, LENGTH = 0x00400000
+  DDR   (rwx) : ORIGIN = 0x70000000, LENGTH = 0x02000000
+}
+
+/* Linker script to place sections and symbol values. Should be used together
+ * with other linker script that defines memory regions ITCM and RAM.
+ * It references following symbols, which must be defined in code:
+ *   Reset_Handler : Entry of reset handler
+ *
+ * It defines following symbols, which code can use without definition:
+ *   __exidx_start
+ *   __exidx_end
+ *   __copy_table_start__
+ *   __copy_table_end__
+ *   __zero_table_start__
+ *   __zero_table_end__
+ *   __etext
+ *   __data_start__
+ *   __preinit_array_start
+ *   __preinit_array_end
+ *   __init_array_start
+ *   __init_array_end
+ *   __fini_array_start
+ *   __fini_array_end
+ *   __data_end__
+ *   __bss_start__
+ *   __bss_end__
+ *   __end__
+ *   end
+ *   __HeapLimit
+ *   __StackLimit
+ *   __StackTop
+ *   __stack
+ */
+ENTRY(Reset_Handler)
+
+SECTIONS
+{
+  .text.at_itcm :
+  {
+    KEEP(*(.vectors))
+    *(.text*)
+
+    KEEP(*(.init))
+    KEEP(*(.fini))
+
+    /* .ctors */
+    *crtbegin.o(.ctors)
+    *crtbegin?.o(.ctors)
+    *(EXCLUDE_FILE(*crtend?.o *crtend.o) .ctors)
+    *(SORT(.ctors.*))
+    *(.ctors)
+
+    /* .dtors */
+    *crtbegin.o(.dtors)
+    *crtbegin?.o(.dtors)
+    *(EXCLUDE_FILE(*crtend?.o *crtend.o) .dtors)
+    *(SORT(.dtors.*))
+    *(.dtors)
+
+    KEEP(*(.eh_frame*))
+  } > ITCM
+
+  .ARM.extab.at_itcm :
+  {
+    *(.ARM.extab* .gnu.linkonce.armextab.*)
+  } > ITCM
+
+  __exidx_start = .;
+  .ARM.exidx.at_itcm :
+  {
+    *(.ARM.exidx* .gnu.linkonce.armexidx.*)
+  } > ITCM
+  __exidx_end = .;
+
+  .zero.table.at_itcm :
+  {
+    . = ALIGN(4);
+    __zero_table_start__ = .;
+
+    LONG (__bss_start__)
+    LONG ((__bss_end__ - __bss_start__)/4) /* Size is in 32-bit words */
+
+    __zero_table_end__ = .;
+  } > ITCM
+
+  .copy.table.at_itcm :
+  {
+    . = ALIGN(4);
+    __copy_table_start__ = .;
+
+    /* Section to be copied - part 1: any data to be placed in BRAM */
+    LONG (__etext)
+    LONG (__data_start__)
+    LONG ((__data_end__ - __data_start__)/4) /* Size is in 32-bit words */
+
+    /* Section to be copied - part 2: RO data for for DTCM */
+    LONG (__etext2)
+    LONG (__ro_data_start__)
+    LONG ((__ro_data_end__ - __ro_data_start__)/4) /* Size is in 32-bit words */
+
+    __copy_table_end__ = .;
+  } > ITCM
+
+  __itcm_total = ALIGN(4);
+
+  ASSERT( __itcm_total < (ORIGIN(ITCM) + LENGTH(ITCM)), "ITCM overflow")
+
+  .sram :
+  {
+    . = ALIGN(16);
+    *(.bss.NoInit.activation_buf)
+    . = ALIGN(16);
+  } > SRAM AT > SRAM
+
+  .bss :
+  {
+    . = ALIGN(4);
+    __bss_start__ = .;
+    *(.bss)
+    *(.bss.*)
+    *(COMMON)
+    . = ALIGN(4);
+    __bss_end__ = .;
+  } > DTCM AT > DTCM
+
+  .stack (ORIGIN(DTCM) + LENGTH(DTCM) - __STACK_SIZE) (COPY) :
+  {
+    . = ALIGN(8);
+    __StackLimit = .;
+    . = . + __STACK_SIZE;
+    . = ALIGN(8);
+    __StackTop = .;
+  } > DTCM
+  PROVIDE(__stack = __StackTop);
+  ASSERT(
+    (__STACK_SIZE + __bss_end__ - __bss_start__) <= LENGTH(DTCM),
+    "DTCM overflow")
+
+  .ddr.at_ddr :
+  {
+    /* __attribute__((aligned(16))) is not handled by the CMSIS startup code.
+     * Force the alignment here as a workaround */
+    . = ALIGN(16);
+    *(ifm)
+    . = ALIGN(16);
+    *(nn_model)
+    . = ALIGN (16);
+    *(labels)
+    . = ALIGN (16);
+    *(activation_buf)
+    . = ALIGN (16);
+  } > DDR AT > DDR
+
+  /**
+   * Location counter can end up 2byte aligned with narrow Thumb code but
+   * __etext is assumed by startup code to be the LMA of a section in DTCM
+   * which must be 4byte aligned
+   */
+  __etext = ALIGN (4);
+
+  .bram.at_ddr :  AT (__etext)
+  {
+    __data_start__ = .;
+    *(vtable)
+    *(.data)
+    *(.data.*)
+    . = ALIGN(4);
+    PROVIDE_HIDDEN (__preinit_array_start = .);
+    KEEP(*(.preinit_array))
+    PROVIDE_HIDDEN (__preinit_array_end = .);
+    . = ALIGN(4);
+    PROVIDE_HIDDEN (__init_array_start = .);
+    KEEP(*(SORT(.init_array.*)))
+    KEEP(*(.init_array))
+    PROVIDE_HIDDEN (__init_array_end = .);
+    . = ALIGN(4);
+    PROVIDE_HIDDEN (__fini_array_start = .);
+    KEEP(*(SORT(.fini_array.*)))
+    KEEP(*(.fini_array))
+    PROVIDE_HIDDEN (__fini_array_end = .);
+    KEEP(*(.jcr*))
+    . = ALIGN(4);
+
+    __data_end__ = .;
+  } > BRAM
+
+  __etext2 = __etext + (__data_end__ - __data_start__);
+
+  .data.at_ddr : AT (__etext2)
+  {
+    . = ALIGN(4);
+    __ro_data_start__ = .;
+
+    *(.rodata*)
+    . = ALIGN(4);
+    * (npu_driver_version)
+    . = ALIGN(4);
+    * (npu_driver_arch_version)
+    . = ALIGN(4);
+
+    __ro_data_end__ = .;
+  } > BRAM
+
+  .heap (COPY) :
+  {
+    . = ALIGN(8);
+    __end__ = .;
+    PROVIDE(end = .);
+    . = . + __HEAP_SIZE;
+    . = ALIGN(8);
+    __HeapLimit = .;
+  } > BRAM
+
+  ASSERT (
+      (__ro_data_end__ - __ro_data_start__)
+    + (__data_end__  - __data_start__)
+    + __HEAP_SIZE <= LENGTH(BRAM),
+    "BRAM overflow")
+}
diff --git a/source/application/hal/platforms/bare-metal/bsp/mem_layout/simple_platform.sct b/source/application/hal/platforms/bare-metal/bsp/mem_layout/simple_platform.sct
index a1ffb49..deb4214 100644
--- a/source/application/hal/platforms/bare-metal/bsp/mem_layout/simple_platform.sct
+++ b/source/application/hal/platforms/bare-metal/bsp/mem_layout/simple_platform.sct
@@ -36,35 +36,36 @@
     }
 
     ;-----------------------------------------------------
-    ; BRAM or FPGA data SRAM region worth 2MiB
-    ;-----------------------------------------------------
-    bram.bin        0x11000000  UNINIT ALIGN 16 0x00200000
-    {
-        ; activation buffers a.k.a tensor arena
-        *.o (.bss.NoInit.activation_buf)
-    }
-
-    ;-----------------------------------------------------
-    ; 128kiB of 512kiB bank is used for any other RW or ZI
+    ; 128kiB of 512kiB DTCM is used for any other RW or ZI
     ; data. Note: this region is internal to the Cortex-M
-    ; CPU
+    ; CPU.
     ;-----------------------------------------------------
     dtcm.bin        0x20000000                  0x00020000
     {
+        ; Any R/W and/or zero initialised data
         .ANY(+RW +ZI)
     }
 
     ;-----------------------------------------------------
-    ; 128kiB of stack space within the DTCM region
+    ; 384kiB of stack space within the DTCM region. See
+    ; `dtcm.bin` for the first section. Note: by virtue of
+    ; being part of DTCM, this region is only accessible
+    ; from Cortex-M55.
     ;-----------------------------------------------------
-    ARM_LIB_STACK   0x20020000 EMPTY ALIGN 8    0x00020000
+    ARM_LIB_STACK   0x20020000 EMPTY ALIGN 8    0x00060000
     {}
 
     ;-----------------------------------------------------
-    ; 256kiB of heap space within the DTCM region
+    ; SSE-300's internal SRAM of 4MiB - reserved for
+    ; activation buffers.
+    ; This region should have 3 cycle read latency from
+    ; both Cortex-M55 and Ethos-U55
     ;-----------------------------------------------------
-    ARM_LIB_HEAP    0x20040000 EMPTY ALIGN 8    0x00040000
-    {}
+    isram.bin       0x31000000  UNINIT ALIGN 16 0x00400000
+    {
+        ; activation buffers a.k.a tensor arena
+        *.o (.bss.NoInit.activation_buf)
+    }
 }
 
 ;---------------------------------------------------------
@@ -73,9 +74,12 @@
 LOAD_REGION_1       0x70000000                  0x02000000
 {
     ;-----------------------------------------------------
-    ; 32 MiB of DRAM space for nn model and input vectors
+    ; 32 MiB of DDR space for neural network model,
+    ; input vectors and labels. If the activation buffer
+    ; size required by the network is bigger than the
+    ; SRAM size available, it is accommodated here.
     ;-----------------------------------------------------
-    dram.bin        0x70000000 ALIGN 16         0x02000000
+    ddr.bin        0x70000000 ALIGN 16         0x02000000
     {
         ; nn model's baked in input matrices
         *.o (ifm)
@@ -83,20 +87,28 @@
         ; nn model
         *.o (nn_model)
 
+        ; labels
+        *.o (labels)
+
         ; if the activation buffer (tensor arena) doesn't
         ; fit in the SRAM region, we accommodate it here
         *.o (activation_buf)
     }
 
     ;-----------------------------------------------------
-    ; SSE-300's internal SRAM of 2MiB - reserved for
-    ; activation buffers.
-    ; This region should have 3 cycle read latency from
-    ; both Cortex-M55 and Ethos-U55
+    ; First 256kiB of BRAM (FPGA SRAM) used for RO data.
+    ; Note: Total BRAM size available is 2MiB.
     ;-----------------------------------------------------
-    isram.bin       0x31000000                  0x00080000
+    bram.bin        0x11000000          ALIGN 8 0x00040000
     {
         ; RO data (incl. unwinding tables for debugging)
         .ANY (+RO-DATA)
     }
+
+    ;-----------------------------------------------------
+    ; 960 KiB of remaining part of the 2MiB BRAM used as
+    ; heap space. 0x000F0000 of 0x0x001C0000 available.
+    ;-----------------------------------------------------
+    ARM_LIB_HEAP    0x11040000 EMPTY ALIGN 8    0x000F0000
+    {}
 }