Blame - src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp - ml/ComputeLibrary

blob: dc77d0c45025f2eafbc18c2bf629a439f5b744d0 [file] [log] [blame]

Gunes Bayir	ae72a46	2023-01-29 13:24:24 +0000	[diff] [blame]	1	/*
				2	* Copyright (c) 2023 Arm Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24
Matthew Bentham	f1aeab9	2023-05-30 13:35:34 +0000	[diff] [blame^]	25	#include "arm_compute/core/ActivationLayerInfo.h"
Gunes Bayir	ae72a46	2023-01-29 13:24:24 +0000	[diff] [blame]	26	#include "arm_compute/core/Helpers.h"
				27	#include "arm_compute/core/ITensor.h"
				28	#include "arm_compute/core/QuantizationInfo.h"
				29	#include "arm_compute/core/Types.h"
				30	#include "arm_compute/core/Window.h"
				31
				32	#include <cstddef>
				33	#include <cstdint>
				34	#include <limits>
				35
				36	#ifdef __aarch64__
				37	namespace
				38	{
				39	void a64_add_bn_clamp_direct_u8_fp32_2x16(
				40	uint8_t *out, size_t out_stride,
				41	uint8_t *out_direct, size_t out_direct_stride,
				42	const uint8_t *in0, size_t in0_stride,
				43	const uint8_t *in1, size_t in1_stride,
				44	const float *bn_mul,
				45	const float *bn_add,
				46	const uint8_t minval,
				47	const uint8_t maxval,
				48	int32_t out_zeropt, float out_scale,
				49	int32_t out_direct_zeropt, float out_direct_scale,
				50	int32_t in0_zeropt, float in0_scale,
				51	int32_t in1_zeropt, float in1_scale,
				52	size_t width, size_t height)
				53	{
				54	float scales[4] = { in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale };
				55	struct KernelArgs
				56	{
				57	const float *scales;
				58	int32_t in0_zeropt;
				59	int32_t in1_zeropt;
				60	int32_t out_zeropt;
				61	int32_t out_direct_zeropt;
				62	int32_t minval;
				63	int32_t maxval;
				64	} ka;
				65	ka.scales = scales;
				66	ka.in0_zeropt = in0_zeropt;
				67	ka.in1_zeropt = in1_zeropt;
				68	ka.out_zeropt = out_zeropt;
				69	ka.out_direct_zeropt = out_direct_zeropt;
				70	ka.minval = minval;
				71	ka.maxval = maxval;
				72
				73	__asm__ __volatile__(
				74	"ldr x20, [%x[args_ptr], %[offsetof_scales]]\n"
				75	"ld1 { v0.4s }, [x20]\n"
				76	"cmp %x[width], #0x10\n"
				77	"blt 5f\n"
				78	"1:" // Column loop
				79	"ldr q24, [%x[bn_mul], #0x0]\n"
				80	"ldr q25, [%x[bn_mul], #0x10]\n"
				81	"mov x23, %x[height]\n"
				82	"mov x12, %x[in0]\n"
				83	"ldr q26, [%x[bn_mul], #0x20]\n"
				84	"ldr q27, [%x[bn_mul], #0x30]\n"
				85	"mov x11, %x[in1]\n"
				86	"mov x10, %x[out]\n"
				87	"ldr q28, [%x[bn_add], #0x0]\n"
				88	"ldr q29, [%x[bn_add], #0x10]\n"
				89	"mov x9, %x[out_direct]\n"
				90	"add %x[bn_mul], %x[bn_mul], #0x40\n"
				91	"ldr q30, [%x[bn_add], #0x20]\n"
				92	"ldr q31, [%x[bn_add], #0x30]\n"
				93	"add %x[bn_add], %x[bn_add], #0x40\n"
				94	"2:" // Row loop
				95	"mov x28, x12\n"
				96	"ldr d4, [x28, #0x0]\n"
				97	"ldr d3, [x28, #0x8]\n"
				98	"add x21, x28, %x[in0_stride]\n"
				99	"mov x27, x11\n"
				100	"ldr d13, [x27, #0x0]\n"
				101	"ldr d12, [x27, #0x8]\n"
				102	"cmp x23, #0x2\n"
				103	"add x12, x21, %x[in0_stride]\n"
				104	"csel x21, x21, x28, GE\n"
				105	"ldr d2, [x21, #0x0]\n"
				106	"ldr d11, [x21, #0x8]\n"
				107	"add x20, x27, %x[in1_stride]\n"
				108	"add x11, x20, %x[in1_stride]\n"
				109	"ldr w21, [%x[args_ptr], %[offsetof_in0_zeropt]]\n"
				110	"ushll v4.8h, v4.8b, #0x0\n"
				111	"csel x20, x20, x27, GE\n"
				112	"ldr d10, [x20, #0x0]\n"
				113	"ldr d9, [x20, #0x8]\n"
				114	"ushll v3.8h, v3.8b, #0x0\n"
				115	"ushll v2.8h, v2.8b, #0x0\n"
				116	"ushll v11.8h, v11.8b, #0x0\n"
				117	"ldr w20, [%x[args_ptr], %[offsetof_in1_zeropt]]\n"
				118	"mov x26, x10\n"
				119	"dup v16.8h, w21\n"
				120	"ushll v13.8h, v13.8b, #0x0\n"
				121	"mov x25, x9\n"
				122	"add x24, x26, %x[out_stride]\n"
				123	"ushll v12.8h, v12.8b, #0x0\n"
				124	"ushll v10.8h, v10.8b, #0x0\n"
				125	"add x22, x25, %x[out_direct_stride]\n"
				126	"add x10, x24, %x[out_stride]\n"
				127	"ushll v9.8h, v9.8b, #0x0\n"
				128	"ssubl v1.4s, v4.4h, v16.4h\n"
				129	"add x9, x22, %x[out_direct_stride]\n"
				130	"csel x24, x24, x26, GE\n"
				131	"ssubl2 v4.4s, v4.8h, v16.8h\n"
				132	"ssubl v23.4s, v3.4h, v16.4h\n"
				133	"csel x22, x22, x25, GE\n"
				134	"ssubl2 v3.4s, v3.8h, v16.8h\n"
				135	"ssubl v22.4s, v2.4h, v16.4h\n"
				136	"ssubl2 v2.4s, v2.8h, v16.8h\n"
				137	"ssubl v21.4s, v11.4h, v16.4h\n"
				138	"ssubl2 v11.4s, v11.8h, v16.8h\n"
				139	"dup v20.8h, w20\n"
				140	"ssubl v19.4s, v13.4h, v20.4h\n"
				141	"ssubl2 v13.4s, v13.8h, v20.8h\n"
				142	"ssubl v18.4s, v12.4h, v20.4h\n"
				143	"ssubl2 v12.4s, v12.8h, v20.8h\n"
				144	"ssubl v17.4s, v10.4h, v20.4h\n"
				145	"ssubl2 v10.4s, v10.8h, v20.8h\n"
				146	"ssubl v16.4s, v9.4h, v20.4h\n"
				147	"ssubl2 v9.4s, v9.8h, v20.8h\n"
				148	"scvtf v8.4s, v1.4s\n"
				149	"scvtf v7.4s, v4.4s\n"
				150	"scvtf v6.4s, v23.4s\n"
				151	"scvtf v5.4s, v3.4s\n"
				152	"scvtf v4.4s, v22.4s\n"
				153	"scvtf v3.4s, v2.4s\n"
				154	"scvtf v2.4s, v21.4s\n"
				155	"scvtf v1.4s, v11.4s\n"
				156	"scvtf v19.4s, v19.4s\n"
				157	"fmul v8.4s, v8.4s, v0.s[0]\n"
				158	"fmla v8.4s, v19.4s, v0.s[1]\n"
				159	"scvtf v13.4s, v13.4s\n"
				160	"fmul v7.4s, v7.4s, v0.s[0]\n"
				161	"fmla v7.4s, v13.4s, v0.s[1]\n"
				162	"scvtf v18.4s, v18.4s\n"
				163	"fmul v6.4s, v6.4s, v0.s[0]\n"
				164	"fmla v6.4s, v18.4s, v0.s[1]\n"
				165	"scvtf v12.4s, v12.4s\n"
				166	"fmul v5.4s, v5.4s, v0.s[0]\n"
				167	"fmla v5.4s, v12.4s, v0.s[1]\n"
				168	"scvtf v17.4s, v17.4s\n"
				169	"fmul v4.4s, v4.4s, v0.s[0]\n"
				170	"fmla v4.4s, v17.4s, v0.s[1]\n"
				171	"scvtf v10.4s, v10.4s\n"
				172	"fmul v3.4s, v3.4s, v0.s[0]\n"
				173	"fmla v3.4s, v10.4s, v0.s[1]\n"
				174	"scvtf v16.4s, v16.4s\n"
				175	"fmul v2.4s, v2.4s, v0.s[0]\n"
				176	"fmla v2.4s, v16.4s, v0.s[1]\n"
				177	"scvtf v9.4s, v9.4s\n"
				178	"fmul v1.4s, v1.4s, v0.s[0]\n"
				179	"fmla v1.4s, v9.4s, v0.s[1]\n"
				180	"cbz %x[out_direct], 3f\n"
				181	"fmul v23.4s, v8.4s, v0.s[3]\n"
				182	"fmul v22.4s, v7.4s, v0.s[3]\n"
				183	"ldr w20, [%x[args_ptr], %[offsetof_out_direct_zeropt]]\n"
				184	"fmul v21.4s, v6.4s, v0.s[3]\n"
				185	"fmul v20.4s, v5.4s, v0.s[3]\n"
				186	"fmul v19.4s, v4.4s, v0.s[3]\n"
				187	"fmul v18.4s, v3.4s, v0.s[3]\n"
				188	"fmul v16.4s, v2.4s, v0.s[3]\n"
				189	"fmul v17.4s, v1.4s, v0.s[3]\n"
				190	"fcvtas v23.4s, v23.4s\n"
				191	"fcvtas v22.4s, v22.4s\n"
				192	"fcvtas v21.4s, v21.4s\n"
				193	"fcvtas v20.4s, v20.4s\n"
				194	"fcvtas v19.4s, v19.4s\n"
				195	"fcvtas v18.4s, v18.4s\n"
				196	"fcvtas v16.4s, v16.4s\n"
				197	"fcvtas v17.4s, v17.4s\n"
				198	"uzp1 v22.8h, v23.8h, v22.8h\n"
				199	"uzp1 v20.8h, v21.8h, v20.8h\n"
				200	"uzp1 v18.8h, v19.8h, v18.8h\n"
				201	"uzp1 v17.8h, v16.8h, v17.8h\n"
				202	"dup v16.8h, w20\n"
				203	"add v22.8h, v22.8h, v16.8h\n"
				204	"add v20.8h, v20.8h, v16.8h\n"
				205	"add v18.8h, v18.8h, v16.8h\n"
				206	"add v17.8h, v17.8h, v16.8h\n"
				207	"movi v16.8h, #0xff\n"
				208	"smin v22.8h, v22.8h, v16.8h\n"
				209	"smin v20.8h, v20.8h, v16.8h\n"
				210	"smin v18.8h, v18.8h, v16.8h\n"
				211	"smin v17.8h, v17.8h, v16.8h\n"
				212	"movi v16.8h, #0x0\n"
				213	"smax v22.8h, v22.8h, v16.8h\n"
				214	"smax v20.8h, v20.8h, v16.8h\n"
				215	"smax v18.8h, v18.8h, v16.8h\n"
				216	"smax v17.8h, v17.8h, v16.8h\n"
				217	"xtn v22.8b, v22.8h\n"
				218	"str d22, [x25, #0x0]\n"
				219	"xtn v20.8b, v20.8h\n"
				220	"xtn v18.8b, v18.8h\n"
				221	"str d20, [x25, #0x8]\n"
				222	"xtn v17.8b, v17.8h\n"
				223	"str d18, [x22, #0x0]\n"
				224	"str d17, [x22, #0x8]\n"
				225	"3:" // Main loop: No direct output
				226	"mov v19.16b, v28.16b\n"
				227	"mov v13.16b, v29.16b\n"
				228	"fmla v19.4s, v8.4s, v24.4s\n"
				229	"ldr w22, [%x[args_ptr], %[offsetof_out_zeropt]]\n"
				230	"mov v18.16b, v30.16b\n"
				231	"mov v12.16b, v31.16b\n"
				232	"fmla v13.4s, v7.4s, v25.4s\n"
				233	"ldr w21, [%x[args_ptr], %[offsetof_maxval]]\n"
				234	"mov v17.16b, v28.16b\n"
				235	"mov v10.16b, v29.16b\n"
				236	"fmla v18.4s, v6.4s, v26.4s\n"
				237	"ldr w20, [%x[args_ptr], %[offsetof_minval]]\n"
				238	"mov v16.16b, v30.16b\n"
				239	"mov v9.16b, v31.16b\n"
				240	"fmla v12.4s, v5.4s, v27.4s\n"
				241	"subs x23, x23, #0x2\n"
				242	"fmla v17.4s, v4.4s, v24.4s\n"
				243	"fmla v10.4s, v3.4s, v25.4s\n"
				244	"fmul v8.4s, v19.4s, v0.s[2]\n"
				245	"fmla v16.4s, v2.4s, v26.4s\n"
				246	"fmla v9.4s, v1.4s, v27.4s\n"
				247	"fmul v7.4s, v13.4s, v0.s[2]\n"
				248	"fmul v6.4s, v18.4s, v0.s[2]\n"
				249	"fmul v5.4s, v12.4s, v0.s[2]\n"
				250	"fmul v4.4s, v17.4s, v0.s[2]\n"
				251	"fmul v3.4s, v10.4s, v0.s[2]\n"
				252	"fmul v2.4s, v16.4s, v0.s[2]\n"
				253	"fmul v1.4s, v9.4s, v0.s[2]\n"
				254	"fcvtas v8.4s, v8.4s\n"
				255	"fcvtas v7.4s, v7.4s\n"
				256	"fcvtas v6.4s, v6.4s\n"
				257	"fcvtas v5.4s, v5.4s\n"
				258	"fcvtas v4.4s, v4.4s\n"
				259	"fcvtas v3.4s, v3.4s\n"
				260	"fcvtas v2.4s, v2.4s\n"
				261	"fcvtas v1.4s, v1.4s\n"
				262	"uzp1 v7.8h, v8.8h, v7.8h\n"
				263	"uzp1 v5.8h, v6.8h, v5.8h\n"
				264	"uzp1 v3.8h, v4.8h, v3.8h\n"
				265	"uzp1 v1.8h, v2.8h, v1.8h\n"
				266	"dup v16.8h, w22\n"
				267	"add v7.8h, v7.8h, v16.8h\n"
				268	"add v5.8h, v5.8h, v16.8h\n"
				269	"add v3.8h, v3.8h, v16.8h\n"
				270	"add v1.8h, v1.8h, v16.8h\n"
				271	"dup v16.8h, w21\n"
				272	"smin v7.8h, v7.8h, v16.8h\n"
				273	"smin v5.8h, v5.8h, v16.8h\n"
				274	"smin v3.8h, v3.8h, v16.8h\n"
				275	"smin v1.8h, v1.8h, v16.8h\n"
				276	"dup v16.8h, w20\n"
				277	"smax v7.8h, v7.8h, v16.8h\n"
				278	"smax v5.8h, v5.8h, v16.8h\n"
				279	"smax v3.8h, v3.8h, v16.8h\n"
				280	"smax v1.8h, v1.8h, v16.8h\n"
				281	"xtn v7.8b, v7.8h\n"
				282	"str d7, [x26, #0x0]\n"
				283	"xtn v5.8b, v5.8h\n"
				284	"xtn v3.8b, v3.8h\n"
				285	"str d5, [x26, #0x8]\n"
				286	"xtn v1.8b, v1.8h\n"
				287	"str d3, [x24, #0x0]\n"
				288	"str d1, [x24, #0x8]\n"
				289	"bgt 2b\n"
				290	"add %x[in0], %x[in0], #0x10\n"
				291	"add %x[in1], %x[in1], #0x10\n"
				292	"add %x[out], %x[out], #0x10\n"
				293	"cbz %x[out_direct], 4f\n"
				294	"add %x[out_direct], %x[out_direct], #0x10\n"
				295	"4:" // No direct pointer update
				296	"sub %x[width], %x[width], #0x10\n"
				297	"cmp %x[width], #0x10\n"
				298	"bge 1b\n"
				299	"cbz %x[width], 32f\n"
				300	"5:" // main loop skip
				301	"ldr q24, [%x[bn_mul], #0x0]\n"
				302	"ldr q25, [%x[bn_mul], #0x10]\n"
				303	"mov x23, %x[height]\n"
				304	"mov x12, %x[in0]\n"
				305	"ldr q26, [%x[bn_mul], #0x20]\n"
				306	"ldr q27, [%x[bn_mul], #0x30]\n"
				307	"mov x11, %x[in1]\n"
				308	"mov x10, %x[out]\n"
				309	"ldr q28, [%x[bn_add], #0x0]\n"
				310	"ldr q29, [%x[bn_add], #0x10]\n"
				311	"mov x9, %x[out_direct]\n"
				312	"add %x[bn_mul], %x[bn_mul], #0x40\n"
				313	"ldr q30, [%x[bn_add], #0x20]\n"
				314	"ldr q31, [%x[bn_add], #0x30]\n"
				315	"add %x[bn_add], %x[bn_add], #0x40\n"
				316	"6:" // tail loop: Row loop
				317	"mov x28, x12\n"
				318	"mov x27, x11\n"
				319	"mov x26, x10\n"
				320	"mov x25, x9\n"
				321	"add x21, x28, %x[in0_stride]\n"
				322	"add x20, x27, %x[in1_stride]\n"
				323	"add x24, x26, %x[out_stride]\n"
				324	"add x22, x25, %x[out_direct_stride]\n"
				325	"cmp x23, #0x2\n"
				326	"add x12, x21, %x[in0_stride]\n"
				327	"add x11, x20, %x[in1_stride]\n"
				328	"add x10, x24, %x[out_stride]\n"
				329	"add x9, x22, %x[out_direct_stride]\n"
				330	"csel x21, x21, x28, GE\n"
				331	"csel x20, x20, x27, GE\n"
				332	"csel x24, x24, x26, GE\n"
				333	"csel x22, x22, x25, GE\n"
				334	"tbz %x[width], #3, 10f\n"
				335	"ldr d4, [x28, #0x0]\n"
				336	"ldr d13, [x27, #0x0]\n"
				337	"add x28, x28, #0x8\n"
				338	"add x27, x27, #0x8\n"
				339	"ldr d2, [x21, #0x0]\n"
				340	"ldr d10, [x20, #0x0]\n"
				341	"add x21, x21, #0x8\n"
				342	"add x20, x20, #0x8\n"
				343	"tbz %x[width], #2, 8f\n"
				344	"ldr s3, [x28], #0x4\n"
				345	"ldr s12, [x27], #0x4\n"
				346	"ldr s11, [x21], #0x4\n"
				347	"ldr s9, [x20], #0x4\n"
				348	"tbz %x[width], #1, 7f\n"
				349	"ld1 { v3.h }[2], [x28], #0x2\n"
				350	"ld1 { v12.h }[2], [x27], #0x2\n"
				351	"ld1 { v11.h }[2], [x21], #0x2\n"
				352	"ld1 { v9.h }[2], [x20], #0x2\n"
				353	"tbz %x[width], #0, 14f\n"
				354	"ld1 { v3.b }[6], [x28], #0x1\n"
				355	"ld1 { v12.b }[6], [x27], #0x1\n"
				356	"ld1 { v11.b }[6], [x21], #0x1\n"
				357	"ld1 { v9.b }[6], [x20], #0x1\n"
				358	"b 14f\n"
				359	"7:" // tail loop: unique 1: partial_0_12
				360	"tbz %x[width], #0, 14f\n"
				361	"ld1 { v3.b }[4], [x28], #0x1\n"
				362	"ld1 { v12.b }[4], [x27], #0x1\n"
				363	"ld1 { v11.b }[4], [x21], #0x1\n"
				364	"ld1 { v9.b }[4], [x20], #0x1\n"
				365	"b 14f\n"
				366	"8:" // tail loop: unique 1: partial_1_8
				367	"tbz %x[width], #1, 9f\n"
				368	"ldr h3, [x28], #0x2\n"
				369	"ldr h12, [x27], #0x2\n"
				370	"ldr h11, [x21], #0x2\n"
				371	"ldr h9, [x20], #0x2\n"
				372	"tbz %x[width], #0, 14f\n"
				373	"ld1 { v3.b }[2], [x28], #0x1\n"
				374	"ld1 { v12.b }[2], [x27], #0x1\n"
				375	"ld1 { v11.b }[2], [x21], #0x1\n"
				376	"ld1 { v9.b }[2], [x20], #0x1\n"
				377	"b 14f\n"
				378	"9:" // tail loop: unique 1: partial_0_8
				379	"tbz %x[width], #0, 14f\n"
				380	"ldr b3, [x28], #0x1\n"
				381	"ldr b12, [x27], #0x1\n"
				382	"ldr b11, [x21], #0x1\n"
				383	"ldr b9, [x20], #0x1\n"
				384	"b 14f\n"
				385	"10:" // tail loop: unique 1: partial_2_0
				386	"tbz %x[width], #2, 12f\n"
				387	"ldr s4, [x28], #0x4\n"
				388	"ldr s13, [x27], #0x4\n"
				389	"ldr s2, [x21], #0x4\n"
				390	"ldr s10, [x20], #0x4\n"
				391	"tbz %x[width], #1, 11f\n"
				392	"ld1 { v4.h }[2], [x28], #0x2\n"
				393	"ld1 { v13.h }[2], [x27], #0x2\n"
				394	"ld1 { v2.h }[2], [x21], #0x2\n"
				395	"ld1 { v10.h }[2], [x20], #0x2\n"
				396	"tbz %x[width], #0, 14f\n"
				397	"ld1 { v4.b }[6], [x28], #0x1\n"
				398	"ld1 { v13.b }[6], [x27], #0x1\n"
				399	"ld1 { v2.b }[6], [x21], #0x1\n"
				400	"ld1 { v10.b }[6], [x20], #0x1\n"
				401	"b 14f\n"
				402	"11:" // tail loop: unique 1: partial_0_4
				403	"tbz %x[width], #0, 14f\n"
				404	"ld1 { v4.b }[4], [x28], #0x1\n"
				405	"ld1 { v13.b }[4], [x27], #0x1\n"
				406	"ld1 { v2.b }[4], [x21], #0x1\n"
				407	"ld1 { v10.b }[4], [x20], #0x1\n"
				408	"b 14f\n"
				409	"12:" // tail loop: unique 1: partial_1_0
				410	"tbz %x[width], #1, 13f\n"
				411	"ldr h4, [x28], #0x2\n"
				412	"ldr h13, [x27], #0x2\n"
				413	"ldr h2, [x21], #0x2\n"
				414	"ldr h10, [x20], #0x2\n"
				415	"tbz %x[width], #0, 14f\n"
				416	"ld1 { v4.b }[2], [x28], #0x1\n"
				417	"ld1 { v13.b }[2], [x27], #0x1\n"
				418	"ld1 { v2.b }[2], [x21], #0x1\n"
				419	"ld1 { v10.b }[2], [x20], #0x1\n"
				420	"b 14f\n"
				421	"13:" // tail loop: unique 1: partial_0_0
				422	"ldr b4, [x28], #0x1\n"
				423	"ldr b13, [x27], #0x1\n"
				424	"ldr b2, [x21], #0x1\n"
				425	"ldr b10, [x20], #0x1\n"
				426	"14:" // tail loop: unique 1: Done
				427	"ldr w21, [%x[args_ptr], %[offsetof_in0_zeropt]]\n"
				428	"ushll v4.8h, v4.8b, #0x0\n"
				429	"ushll v3.8h, v3.8b, #0x0\n"
				430	"ldr w20, [%x[args_ptr], %[offsetof_in1_zeropt]]\n"
				431	"ushll v2.8h, v2.8b, #0x0\n"
				432	"ushll v11.8h, v11.8b, #0x0\n"
				433	"dup v16.8h, w21\n"
				434	"ushll v13.8h, v13.8b, #0x0\n"
				435	"ushll v12.8h, v12.8b, #0x0\n"
				436	"ushll v10.8h, v10.8b, #0x0\n"
				437	"ushll v9.8h, v9.8b, #0x0\n"
				438	"ssubl v1.4s, v4.4h, v16.4h\n"
				439	"ssubl2 v4.4s, v4.8h, v16.8h\n"
				440	"ssubl v23.4s, v3.4h, v16.4h\n"
				441	"ssubl2 v3.4s, v3.8h, v16.8h\n"
				442	"ssubl v22.4s, v2.4h, v16.4h\n"
				443	"ssubl2 v2.4s, v2.8h, v16.8h\n"
				444	"ssubl v21.4s, v11.4h, v16.4h\n"
				445	"ssubl2 v11.4s, v11.8h, v16.8h\n"
				446	"dup v20.8h, w20\n"
				447	"ssubl v19.4s, v13.4h, v20.4h\n"
				448	"ssubl2 v13.4s, v13.8h, v20.8h\n"
				449	"ssubl v18.4s, v12.4h, v20.4h\n"
				450	"ssubl2 v12.4s, v12.8h, v20.8h\n"
				451	"ssubl v17.4s, v10.4h, v20.4h\n"
				452	"ssubl2 v10.4s, v10.8h, v20.8h\n"
				453	"ssubl v16.4s, v9.4h, v20.4h\n"
				454	"ssubl2 v9.4s, v9.8h, v20.8h\n"
				455	"scvtf v8.4s, v1.4s\n"
				456	"scvtf v7.4s, v4.4s\n"
				457	"scvtf v6.4s, v23.4s\n"
				458	"scvtf v5.4s, v3.4s\n"
				459	"scvtf v4.4s, v22.4s\n"
				460	"scvtf v3.4s, v2.4s\n"
				461	"scvtf v2.4s, v21.4s\n"
				462	"scvtf v1.4s, v11.4s\n"
				463	"scvtf v19.4s, v19.4s\n"
				464	"fmul v8.4s, v8.4s, v0.s[0]\n"
				465	"fmla v8.4s, v19.4s, v0.s[1]\n"
				466	"scvtf v13.4s, v13.4s\n"
				467	"fmul v7.4s, v7.4s, v0.s[0]\n"
				468	"fmla v7.4s, v13.4s, v0.s[1]\n"
				469	"scvtf v18.4s, v18.4s\n"
				470	"fmul v6.4s, v6.4s, v0.s[0]\n"
				471	"fmla v6.4s, v18.4s, v0.s[1]\n"
				472	"scvtf v12.4s, v12.4s\n"
				473	"fmul v5.4s, v5.4s, v0.s[0]\n"
				474	"fmla v5.4s, v12.4s, v0.s[1]\n"
				475	"scvtf v17.4s, v17.4s\n"
				476	"fmul v4.4s, v4.4s, v0.s[0]\n"
				477	"fmla v4.4s, v17.4s, v0.s[1]\n"
				478	"scvtf v10.4s, v10.4s\n"
				479	"fmul v3.4s, v3.4s, v0.s[0]\n"
				480	"fmla v3.4s, v10.4s, v0.s[1]\n"
				481	"scvtf v16.4s, v16.4s\n"
				482	"fmul v2.4s, v2.4s, v0.s[0]\n"
				483	"fmla v2.4s, v16.4s, v0.s[1]\n"
				484	"scvtf v9.4s, v9.4s\n"
				485	"fmul v1.4s, v1.4s, v0.s[0]\n"
				486	"fmla v1.4s, v9.4s, v0.s[1]\n"
				487	"cbz %x[out_direct], 23f\n"
				488	"fmul v23.4s, v8.4s, v0.s[3]\n"
				489	"fmul v22.4s, v7.4s, v0.s[3]\n"
				490	"ldr w20, [%x[args_ptr], %[offsetof_out_direct_zeropt]]\n"
				491	"fmul v21.4s, v6.4s, v0.s[3]\n"
				492	"fmul v20.4s, v5.4s, v0.s[3]\n"
				493	"fmul v19.4s, v4.4s, v0.s[3]\n"
				494	"fmul v18.4s, v3.4s, v0.s[3]\n"
				495	"fmul v16.4s, v2.4s, v0.s[3]\n"
				496	"fmul v17.4s, v1.4s, v0.s[3]\n"
				497	"fcvtas v23.4s, v23.4s\n"
				498	"fcvtas v22.4s, v22.4s\n"
				499	"fcvtas v21.4s, v21.4s\n"
				500	"fcvtas v20.4s, v20.4s\n"
				501	"fcvtas v19.4s, v19.4s\n"
				502	"fcvtas v18.4s, v18.4s\n"
				503	"fcvtas v16.4s, v16.4s\n"
				504	"fcvtas v17.4s, v17.4s\n"
				505	"uzp1 v22.8h, v23.8h, v22.8h\n"
				506	"uzp1 v20.8h, v21.8h, v20.8h\n"
				507	"uzp1 v18.8h, v19.8h, v18.8h\n"
				508	"uzp1 v17.8h, v16.8h, v17.8h\n"
				509	"dup v16.8h, w20\n"
				510	"add v22.8h, v22.8h, v16.8h\n"
				511	"add v20.8h, v20.8h, v16.8h\n"
				512	"add v18.8h, v18.8h, v16.8h\n"
				513	"add v17.8h, v17.8h, v16.8h\n"
				514	"movi v16.8h, #0xff\n"
				515	"smin v22.8h, v22.8h, v16.8h\n"
				516	"smin v20.8h, v20.8h, v16.8h\n"
				517	"smin v18.8h, v18.8h, v16.8h\n"
				518	"smin v17.8h, v17.8h, v16.8h\n"
				519	"movi v16.8h, #0x0\n"
				520	"smax v22.8h, v22.8h, v16.8h\n"
				521	"smax v20.8h, v20.8h, v16.8h\n"
				522	"smax v18.8h, v18.8h, v16.8h\n"
				523	"smax v17.8h, v17.8h, v16.8h\n"
				524	"xtn v22.8b, v22.8h\n"
				525	"xtn v20.8b, v20.8h\n"
				526	"xtn v18.8b, v18.8h\n"
				527	"xtn v17.8b, v17.8h\n"
				528	"tbz %x[width], #3, 18f\n"
				529	"str d22, [x25, #0x0]\n"
				530	"add x25, x25, #0x8\n"
				531	"str d18, [x22, #0x0]\n"
				532	"add x22, x22, #0x8\n"
				533	"tbz %x[width], #2, 16f\n"
				534	"str s20, [x25], #0x4\n"
				535	"str s17, [x22], #0x4\n"
				536	"tbz %x[width], #1, 15f\n"
				537	"st1 { v20.h }[2], [x25], #0x2\n"
				538	"st1 { v17.h }[2], [x22], #0x2\n"
				539	"tbz %x[width], #0, 22f\n"
				540	"st1 { v20.b }[6], [x25], #0x1\n"
				541	"st1 { v17.b }[6], [x22], #0x1\n"
				542	"b 22f\n"
				543	"15:" // tail loop: Main loop: unique 2: partial_0_12
				544	"tbz %x[width], #0, 22f\n"
				545	"st1 { v20.b }[4], [x25], #0x1\n"
				546	"st1 { v17.b }[4], [x22], #0x1\n"
				547	"b 22f\n"
				548	"16:" // tail loop: Main loop: unique 2: partial_1_8
				549	"tbz %x[width], #1, 17f\n"
				550	"str h20, [x25], #0x2\n"
				551	"str h17, [x22], #0x2\n"
				552	"tbz %x[width], #0, 22f\n"
				553	"st1 { v20.b }[2], [x25], #0x1\n"
				554	"st1 { v17.b }[2], [x22], #0x1\n"
				555	"b 22f\n"
				556	"17:" // tail loop: Main loop: unique 2: partial_0_8
				557	"tbz %x[width], #0, 22f\n"
				558	"str b20, [x25], #0x1\n"
				559	"str b17, [x22], #0x1\n"
				560	"b 22f\n"
				561	"18:" // tail loop: Main loop: unique 2: partial_2_0
				562	"tbz %x[width], #2, 20f\n"
				563	"str s22, [x25], #0x4\n"
				564	"str s18, [x22], #0x4\n"
				565	"tbz %x[width], #1, 19f\n"
				566	"st1 { v22.h }[2], [x25], #0x2\n"
				567	"st1 { v18.h }[2], [x22], #0x2\n"
				568	"tbz %x[width], #0, 22f\n"
				569	"st1 { v22.b }[6], [x25], #0x1\n"
				570	"st1 { v18.b }[6], [x22], #0x1\n"
				571	"b 22f\n"
				572	"19:" // tail loop: Main loop: unique 2: partial_0_4
				573	"tbz %x[width], #0, 22f\n"
				574	"st1 { v22.b }[4], [x25], #0x1\n"
				575	"st1 { v18.b }[4], [x22], #0x1\n"
				576	"b 22f\n"
				577	"20:" // tail loop: Main loop: unique 2: partial_1_0
				578	"tbz %x[width], #1, 21f\n"
				579	"str h22, [x25], #0x2\n"
				580	"str h18, [x22], #0x2\n"
				581	"tbz %x[width], #0, 22f\n"
				582	"st1 { v22.b }[2], [x25], #0x1\n"
				583	"st1 { v18.b }[2], [x22], #0x1\n"
				584	"b 22f\n"
				585	"21:" // tail loop: Main loop: unique 2: partial_0_0
				586	"str b22, [x25], #0x1\n"
				587	"str b18, [x22], #0x1\n"
				588	"22:" // tail loop: Main loop: unique 2: Done
				589	"23:" // tail loop: Main loop: No direct output
				590	"mov v19.16b, v28.16b\n"
				591	"mov v13.16b, v29.16b\n"
				592	"fmla v19.4s, v8.4s, v24.4s\n"
				593	"ldr w22, [%x[args_ptr], %[offsetof_out_zeropt]]\n"
				594	"mov v18.16b, v30.16b\n"
				595	"mov v12.16b, v31.16b\n"
				596	"fmla v13.4s, v7.4s, v25.4s\n"
				597	"ldr w21, [%x[args_ptr], %[offsetof_maxval]]\n"
				598	"mov v17.16b, v28.16b\n"
				599	"mov v10.16b, v29.16b\n"
				600	"fmla v18.4s, v6.4s, v26.4s\n"
				601	"ldr w20, [%x[args_ptr], %[offsetof_minval]]\n"
				602	"mov v16.16b, v30.16b\n"
				603	"mov v9.16b, v31.16b\n"
				604	"fmla v12.4s, v5.4s, v27.4s\n"
				605	"fmla v17.4s, v4.4s, v24.4s\n"
				606	"fmla v10.4s, v3.4s, v25.4s\n"
				607	"fmul v8.4s, v19.4s, v0.s[2]\n"
				608	"fmla v16.4s, v2.4s, v26.4s\n"
				609	"fmla v9.4s, v1.4s, v27.4s\n"
				610	"fmul v7.4s, v13.4s, v0.s[2]\n"
				611	"fmul v6.4s, v18.4s, v0.s[2]\n"
				612	"fmul v5.4s, v12.4s, v0.s[2]\n"
				613	"fmul v4.4s, v17.4s, v0.s[2]\n"
				614	"fmul v3.4s, v10.4s, v0.s[2]\n"
				615	"fmul v2.4s, v16.4s, v0.s[2]\n"
				616	"fmul v1.4s, v9.4s, v0.s[2]\n"
				617	"fcvtas v8.4s, v8.4s\n"
				618	"fcvtas v7.4s, v7.4s\n"
				619	"fcvtas v6.4s, v6.4s\n"
				620	"fcvtas v5.4s, v5.4s\n"
				621	"fcvtas v4.4s, v4.4s\n"
				622	"fcvtas v3.4s, v3.4s\n"
				623	"fcvtas v2.4s, v2.4s\n"
				624	"fcvtas v1.4s, v1.4s\n"
				625	"uzp1 v7.8h, v8.8h, v7.8h\n"
				626	"uzp1 v5.8h, v6.8h, v5.8h\n"
				627	"uzp1 v3.8h, v4.8h, v3.8h\n"
				628	"uzp1 v1.8h, v2.8h, v1.8h\n"
				629	"dup v16.8h, w22\n"
				630	"add v7.8h, v7.8h, v16.8h\n"
				631	"add v5.8h, v5.8h, v16.8h\n"
				632	"add v3.8h, v3.8h, v16.8h\n"
				633	"add v1.8h, v1.8h, v16.8h\n"
				634	"dup v16.8h, w21\n"
				635	"smin v7.8h, v7.8h, v16.8h\n"
				636	"smin v5.8h, v5.8h, v16.8h\n"
				637	"smin v3.8h, v3.8h, v16.8h\n"
				638	"smin v1.8h, v1.8h, v16.8h\n"
				639	"dup v16.8h, w20\n"
				640	"smax v7.8h, v7.8h, v16.8h\n"
				641	"smax v5.8h, v5.8h, v16.8h\n"
				642	"smax v3.8h, v3.8h, v16.8h\n"
				643	"smax v1.8h, v1.8h, v16.8h\n"
				644	"xtn v7.8b, v7.8h\n"
				645	"xtn v5.8b, v5.8h\n"
				646	"xtn v3.8b, v3.8h\n"
				647	"xtn v1.8b, v1.8h\n"
				648	"tbz %x[width], #3, 27f\n"
				649	"str d7, [x26, #0x0]\n"
				650	"add x26, x26, #0x8\n"
				651	"str d3, [x24, #0x0]\n"
				652	"add x24, x24, #0x8\n"
				653	"tbz %x[width], #2, 25f\n"
				654	"str s5, [x26], #0x4\n"
				655	"str s1, [x24], #0x4\n"
				656	"tbz %x[width], #1, 24f\n"
				657	"st1 { v5.h }[2], [x26], #0x2\n"
				658	"st1 { v1.h }[2], [x24], #0x2\n"
				659	"tbz %x[width], #0, 31f\n"
				660	"st1 { v5.b }[6], [x26], #0x1\n"
				661	"st1 { v1.b }[6], [x24], #0x1\n"
				662	"b 31f\n"
				663	"24:" // tail loop: unique 3: partial_0_12
				664	"tbz %x[width], #0, 31f\n"
				665	"st1 { v5.b }[4], [x26], #0x1\n"
				666	"st1 { v1.b }[4], [x24], #0x1\n"
				667	"b 31f\n"
				668	"25:" // tail loop: unique 3: partial_1_8
				669	"tbz %x[width], #1, 26f\n"
				670	"str h5, [x26], #0x2\n"
				671	"str h1, [x24], #0x2\n"
				672	"tbz %x[width], #0, 31f\n"
				673	"st1 { v5.b }[2], [x26], #0x1\n"
				674	"st1 { v1.b }[2], [x24], #0x1\n"
				675	"b 31f\n"
				676	"26:" // tail loop: unique 3: partial_0_8
				677	"tbz %x[width], #0, 31f\n"
				678	"str b5, [x26], #0x1\n"
				679	"str b1, [x24], #0x1\n"
				680	"b 31f\n"
				681	"27:" // tail loop: unique 3: partial_2_0
				682	"tbz %x[width], #2, 29f\n"
				683	"str s7, [x26], #0x4\n"
				684	"str s3, [x24], #0x4\n"
				685	"tbz %x[width], #1, 28f\n"
				686	"st1 { v7.h }[2], [x26], #0x2\n"
				687	"st1 { v3.h }[2], [x24], #0x2\n"
				688	"tbz %x[width], #0, 31f\n"
				689	"st1 { v7.b }[6], [x26], #0x1\n"
				690	"st1 { v3.b }[6], [x24], #0x1\n"
				691	"b 31f\n"
				692	"28:" // tail loop: unique 3: partial_0_4
				693	"tbz %x[width], #0, 31f\n"
				694	"st1 { v7.b }[4], [x26], #0x1\n"
				695	"st1 { v3.b }[4], [x24], #0x1\n"
				696	"b 31f\n"
				697	"29:" // tail loop: unique 3: partial_1_0
				698	"tbz %x[width], #1, 30f\n"
				699	"str h7, [x26], #0x2\n"
				700	"str h3, [x24], #0x2\n"
				701	"tbz %x[width], #0, 31f\n"
				702	"st1 { v7.b }[2], [x26], #0x1\n"
				703	"st1 { v3.b }[2], [x24], #0x1\n"
				704	"b 31f\n"
				705	"30:" // tail loop: unique 3: partial_0_0
				706	"str b7, [x26], #0x1\n"
				707	"str b3, [x24], #0x1\n"
				708	"31:" // tail loop: unique 3: Done
				709	"subs x23, x23, #0x2\n"
				710	"bgt 6b\n"
				711	"32:" // odd columns skip
				712	: [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width)
				713	: [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)), [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)), [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)), [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride)
				714	: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
				715	}
				716
				717	} // namespace
				718
				719	namespace arm_compute
				720	{
				721	namespace cpu
				722	{
				723	void add_mul_add_u8_neon(const ITensor input1, const ITensor input2, const ITensor bn_mul, const ITensor bn_add,
				724	ITensor add_output, ITensor final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window)
				725	{
				726	ARM_COMPUTE_UNUSED(policy);
				727
				728	const ITensorInfo *final_output_info = final_output->info();
				729	const ITensorInfo *add_output_info = (add_output != nullptr) ? add_output->info() : nullptr;
				730	const ITensorInfo *input1_info = input1->info();
				731	const ITensorInfo *input2_info = input2->info();
				732
				733	const size_t out_stride = final_output_info->strides_in_bytes()[1];
				734	const size_t out_direct_stride = (add_output != nullptr) ? add_output_info->strides_in_bytes()[1] : 0;
				735	const size_t in0_stride = input1_info->strides_in_bytes()[1];
				736	const size_t in1_stride = input2_info->strides_in_bytes()[1];
				737
				738	uint8_t minval = std::numeric_limits<uint8_t>::lowest();
				739	uint8_t maxval = std::numeric_limits<uint8_t>::max();
				740
				741	const UniformQuantizationInfo final_output_qinfo = final_output_info->quantization_info().uniform();
				742	if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
				743	{
				744	minval = quantize_qasymm8(0.f, final_output_qinfo);
				745	}
				746	else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
				747	{
				748	minval = quantize_qasymm8(0.f, final_output_qinfo);
				749	maxval = quantize_qasymm8(act_info.a(), final_output_qinfo);
				750	}
				751	else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
				752	{
				753	minval = quantize_qasymm8(act_info.b(), final_output_qinfo);
				754	maxval = quantize_qasymm8(act_info.a(), final_output_qinfo);
				755	}
				756
				757	const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform();
				758	const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform();
				759	const UniformQuantizationInfo add_output_qinfo = (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo();
				760
				761	const int32_t in1_offset = in1_qinfo.offset;
				762	const int32_t in2_offset = in2_qinfo.offset;
				763	const int32_t out_offset = final_output_qinfo.offset;
				764	const int32_t out_direct_offset = add_output_qinfo.offset;
				765
				766	const float in1_scale = in1_qinfo.scale;
				767	const float in2_scale = in2_qinfo.scale;
				768	const float out_scale = final_output_qinfo.scale;
				769	const float out_direct_scale = add_output_qinfo.scale;
				770
				771	const float bn_mul_buffer = reinterpret_cast<float >(bn_mul->buffer());
				772	const float bn_add_buffer = reinterpret_cast<float >(bn_add->buffer());
				773
				774	// Clear X & Y dimensions on execution window as we handle manually
				775	Window win = window;
				776	win.set(Window::DimX, Window::Dimension(0, 1, 1));
				777	win.set(Window::DimY, Window::Dimension(0, 1, 1));
				778
				779	Iterator in1_it(input1, window);
				780	Iterator in2_it(input2, window);
				781	Iterator out_it(final_output, window);
				782
				783	const size_t width = window.num_iterations(0);
				784	const size_t height = window.num_iterations(1);
				785
				786	if(add_output != nullptr)
				787	{
				788	Iterator add_out_it(add_output, window);
				789	execute_window_loop(
				790	win, [&](const Coordinates &)
				791	{
				792	a64_add_bn_clamp_direct_u8_fp32_2x16(
				793	reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride,
				794	reinterpret_cast<uint8_t *>(add_out_it.ptr()), out_direct_stride,
				795	reinterpret_cast<uint8_t *>(in1_it.ptr()), in0_stride,
				796	reinterpret_cast<uint8_t *>(in2_it.ptr()), in1_stride,
				797	bn_mul_buffer,
				798	bn_add_buffer,
				799	minval,
				800	maxval,
				801	out_offset, out_scale,
				802	out_direct_offset, out_direct_scale,
				803	in1_offset, in1_scale,
				804	in2_offset, in2_scale,
				805	width, height);
				806	},
				807	in1_it, in2_it, add_out_it, out_it);
				808	}
				809	else
				810	{
				811	execute_window_loop(
				812	win, [&](const Coordinates &)
				813	{
				814	a64_add_bn_clamp_direct_u8_fp32_2x16(
				815	reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride,
				816	nullptr, out_direct_stride,
				817	reinterpret_cast<uint8_t *>(in1_it.ptr()), in0_stride,
				818	reinterpret_cast<uint8_t *>(in2_it.ptr()), in1_stride,
				819	bn_mul_buffer,
				820	bn_add_buffer,
				821	minval,
				822	maxval,
				823	out_offset, out_scale,
				824	out_direct_offset, out_direct_scale,
				825	in1_offset, in1_scale,
				826	in2_offset, in2_scale,
				827	width, height);
				828	},
				829	in1_it, in2_it, out_it);
				830	}
				831	}
				832	} // namespace cpu
				833	} // namespace arm_compute
				834
				835	#endif // __aarch64__