Blame - src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp - ml/ComputeLibrary

blob: 3bde83cc267cc3a3a379444942bc788609483756 [file] [log] [blame]

Gunes Bayir	ae72a46	2023-01-29 13:24:24 +0000	[diff] [blame]	1	/*
				2	* Copyright (c) 2023 Arm Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24
				25	#include "arm_compute/core/Helpers.h"
				26	#include "arm_compute/core/ITensor.h"
				27	#include "arm_compute/core/QuantizationInfo.h"
				28	#include "arm_compute/core/Types.h"
				29	#include "arm_compute/core/Window.h"
				30
				31	#include <cstddef>
				32	#include <cstdint>
				33	#include <limits>
				34
				35	#ifdef __aarch64__
				36	namespace
				37	{
					/** AArch64 inline-assembly kernel: add two signed 8-bit quantized inputs, optionally store the
					 *  requantized sum, then apply a per-column multiply/add and store the clamped, requantized result.
					 *
					 * For every element the assembly below evaluates (in fp32):
					 *   sum        = (in0 - in0_zeropt) * in0_scale + (in1 - in1_zeropt) * in1_scale
					 *   out_direct = clamp(round(sum * (1 / out_direct_scale)) + out_direct_zeropt, -128, 127)   [only if out_direct != nullptr]
					 *   out        = clamp(round((bn_add[x] + sum * bn_mul[x]) * (1 / out_scale)) + out_zeropt, minval, maxval)
					 * where round() is FCVTAS (to nearest, ties away from zero) and x is the column index.
					 * Columns are processed in blocks of 16 and rows two at a time.
					 *
					 * @param[out] out                Destination of the final result.
					 * @param[in]  out_stride         Distance between consecutive rows of @p out, added to the raw address (bytes).
					 * @param[out] out_direct         Destination of the requantized sum; may be nullptr, in which case it is skipped.
					 * @param[in]  out_direct_stride  Distance between consecutive rows of @p out_direct (bytes).
					 * @param[in]  in0                First input.
					 * @param[in]  in0_stride         Distance between consecutive rows of @p in0 (bytes).
					 * @param[in]  in1                Second input.
					 * @param[in]  in1_stride         Distance between consecutive rows of @p in1 (bytes).
					 * @param[in]  bn_mul             Per-column fp32 multipliers; 16 values are consumed per block of 16 columns.
					 * @param[in]  bn_add             Per-column fp32 addends; 16 values are consumed per block of 16 columns.
					 * @param[in]  minval             Lower clamp bound applied to @p out.
					 * @param[in]  maxval             Upper clamp bound applied to @p out.
					 * @param[in]  out_zeropt         Zero point added to @p out values.
					 * @param[in]  out_scale          Scale of @p out; its reciprocal is used.
					 * @param[in]  out_direct_zeropt  Zero point added to @p out_direct values.
					 * @param[in]  out_direct_scale   Scale of @p out_direct; its reciprocal is used.
					 * @param[in]  in0_zeropt         Zero point subtracted from @p in0 values.
					 * @param[in]  in0_scale          Scale applied to @p in0 values.
					 * @param[in]  in1_zeropt         Zero point subtracted from @p in1 values.
					 * @param[in]  in1_scale          Scale applied to @p in1 values.
					 * @param[in]  width              Number of columns (elements per row) to process.
					 * @param[in]  height             Number of rows to process.
					 *
					 * NOTE(review): the tail block loads a full 16 floats from bn_mul/bn_add even when fewer than 16
					 * columns remain, so both buffers must be readable up to the next multiple of 16 - confirm with callers.
					 * NOTE(review): there is no guard for width == 0 or height == 0; the row loops execute at least once
					 * and the tail path always handles at least one element - callers presumably never pass 0, confirm.
					 */
				38	void a64_add_bn_clamp_direct_s8_fp32_2x16(
				39	int8_t *out, size_t out_stride,
				40	int8_t *out_direct, size_t out_direct_stride,
				41	const int8_t *in0, size_t in0_stride,
				42	const int8_t *in1, size_t in1_stride,
				43	const float *bn_mul,
				44	const float *bn_add,
				45	const int8_t minval,
				46	const int8_t maxval,
				47	int32_t out_zeropt, float out_scale,
				48	int32_t out_direct_zeropt, float out_direct_scale,
				49	int32_t in0_zeropt, float in0_scale,
				50	int32_t in1_zeropt, float in1_scale,
				51	size_t width, size_t height)
				52	{
					// Scale factors in the lane order the assembly expects in v0:
					// s[0] = in0_scale, s[1] = in1_scale, s[2] = 1 / out_scale, s[3] = 1 / out_direct_scale.
				53	float scales[4] = { in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale };
					// Scalar arguments are passed through this struct; the assembly reads each field
					// via args_ptr plus the matching offsetof() immediate.
				54	struct KernelArgs
				55	{
				56	const float *scales;
				57	int32_t in0_zeropt;
				58	int32_t in1_zeropt;
				59	int32_t out_zeropt;
				60	int32_t out_direct_zeropt;
				61	int32_t minval;
				62	int32_t maxval;
				63	} ka;
				64	ka.scales = scales;
				65	ka.in0_zeropt = in0_zeropt;
				66	ka.in1_zeropt = in1_zeropt;
				67	ka.out_zeropt = out_zeropt;
				68	ka.out_direct_zeropt = out_direct_zeropt;
				69	ka.minval = minval;
				70	ka.maxval = maxval;
				71
				72	__asm__ __volatile__(
					// v0 <- the four scale factors. Fewer than 16 columns: go straight to the tail code at label 5.
				73	"ldr x20, [%x[args_ptr], %[offsetof_scales]]\n"
				74	"ld1 { v0.4s }, [x20]\n"
				75	"cmp %x[width], #0x10\n"
				76	"blt 5f\n"
					// Per block of 16 columns: v24-v27 <- 16 bn_mul values, v28-v31 <- 16 bn_add values (both pointers
					// advance by 64 bytes); x23 <- rows remaining; x12/x11/x10/x9 <- row cursors for in0/in1/out/out_direct.
				77	"1:" // Column loop
				78	"ldr q24, [%x[bn_mul], #0x0]\n"
				79	"ldr q25, [%x[bn_mul], #0x10]\n"
				80	"mov x23, %x[height]\n"
				81	"mov x12, %x[in0]\n"
				82	"ldr q26, [%x[bn_mul], #0x20]\n"
				83	"ldr q27, [%x[bn_mul], #0x30]\n"
				84	"mov x11, %x[in1]\n"
				85	"mov x10, %x[out]\n"
				86	"ldr q28, [%x[bn_add], #0x0]\n"
				87	"ldr q29, [%x[bn_add], #0x10]\n"
				88	"mov x9, %x[out_direct]\n"
				89	"add %x[bn_mul], %x[bn_mul], #0x40\n"
				90	"ldr q30, [%x[bn_add], #0x20]\n"
				91	"ldr q31, [%x[bn_add], #0x30]\n"
				92	"add %x[bn_add], %x[bn_add], #0x40\n"
					// Two rows x 16 columns per iteration. The "cmp x23, #2" flags feed every following csel: when only
					// one row is left (x23 < 2) the second-row pointers fall back to the first row, so that row is
					// loaded and stored twice instead of addressing a row past the end.
				93	"2:" // Row loop
				94	"mov x28, x12\n"
				95	"ldr d4, [x28, #0x0]\n"
				96	"ldr d3, [x28, #0x8]\n"
				97	"add x21, x28, %x[in0_stride]\n"
				98	"mov x27, x11\n"
				99	"ldr d13, [x27, #0x0]\n"
				100	"ldr d12, [x27, #0x8]\n"
				101	"cmp x23, #0x2\n"
				102	"add x12, x21, %x[in0_stride]\n"
				103	"csel x21, x21, x28, GE\n"
				104	"ldr d2, [x21, #0x0]\n"
				105	"ldr d11, [x21, #0x8]\n"
				106	"add x20, x27, %x[in1_stride]\n"
				107	"add x11, x20, %x[in1_stride]\n"
				108	"ldr w21, [%x[args_ptr], %[offsetof_in0_zeropt]]\n"
				109	"sshll v4.8h, v4.8b, #0x0\n"
				110	"csel x20, x20, x27, GE\n"
				111	"ldr d10, [x20, #0x0]\n"
				112	"ldr d9, [x20, #0x8]\n"
				113	"sshll v3.8h, v3.8b, #0x0\n"
				114	"sshll v2.8h, v2.8b, #0x0\n"
				115	"sshll v11.8h, v11.8b, #0x0\n"
				116	"ldr w20, [%x[args_ptr], %[offsetof_in1_zeropt]]\n"
				117	"mov x26, x10\n"
				118	"dup v16.8h, w21\n"
				119	"sshll v13.8h, v13.8b, #0x0\n"
				120	"mov x25, x9\n"
				121	"add x24, x26, %x[out_stride]\n"
				122	"sshll v12.8h, v12.8b, #0x0\n"
				123	"sshll v10.8h, v10.8b, #0x0\n"
				124	"add x22, x25, %x[out_direct_stride]\n"
				125	"add x10, x24, %x[out_stride]\n"
				126	"sshll v9.8h, v9.8b, #0x0\n"
					// Inputs are now sign-extended to 16 bits; ssubl/ssubl2 subtract the zero point while widening to
					// 32 bits (v16 = in0 zero point, v20 = in1 zero point), then scvtf converts to fp32.
				127	"ssubl v1.4s, v4.4h, v16.4h\n"
				128	"add x9, x22, %x[out_direct_stride]\n"
				129	"csel x24, x24, x26, GE\n"
				130	"ssubl2 v4.4s, v4.8h, v16.8h\n"
				131	"ssubl v23.4s, v3.4h, v16.4h\n"
				132	"csel x22, x22, x25, GE\n"
				133	"ssubl2 v3.4s, v3.8h, v16.8h\n"
				134	"ssubl v22.4s, v2.4h, v16.4h\n"
				135	"ssubl2 v2.4s, v2.8h, v16.8h\n"
				136	"ssubl v21.4s, v11.4h, v16.4h\n"
				137	"ssubl2 v11.4s, v11.8h, v16.8h\n"
				138	"dup v20.8h, w20\n"
				139	"ssubl v19.4s, v13.4h, v20.4h\n"
				140	"ssubl2 v13.4s, v13.8h, v20.8h\n"
				141	"ssubl v18.4s, v12.4h, v20.4h\n"
				142	"ssubl2 v12.4s, v12.8h, v20.8h\n"
				143	"ssubl v17.4s, v10.4h, v20.4h\n"
				144	"ssubl2 v10.4s, v10.8h, v20.8h\n"
				145	"ssubl v16.4s, v9.4h, v20.4h\n"
				146	"ssubl2 v9.4s, v9.8h, v20.8h\n"
				147	"scvtf v8.4s, v1.4s\n"
				148	"scvtf v7.4s, v4.4s\n"
				149	"scvtf v6.4s, v23.4s\n"
				150	"scvtf v5.4s, v3.4s\n"
				151	"scvtf v4.4s, v22.4s\n"
				152	"scvtf v3.4s, v2.4s\n"
				153	"scvtf v2.4s, v21.4s\n"
				154	"scvtf v1.4s, v11.4s\n"
					// sum = in0 * in0_scale + in1 * in1_scale; results land in v8..v5 (first row) and v4..v1 (second row).
				155	"scvtf v19.4s, v19.4s\n"
				156	"fmul v8.4s, v8.4s, v0.s[0]\n"
				157	"fmla v8.4s, v19.4s, v0.s[1]\n"
				158	"scvtf v13.4s, v13.4s\n"
				159	"fmul v7.4s, v7.4s, v0.s[0]\n"
				160	"fmla v7.4s, v13.4s, v0.s[1]\n"
				161	"scvtf v18.4s, v18.4s\n"
				162	"fmul v6.4s, v6.4s, v0.s[0]\n"
				163	"fmla v6.4s, v18.4s, v0.s[1]\n"
				164	"scvtf v12.4s, v12.4s\n"
				165	"fmul v5.4s, v5.4s, v0.s[0]\n"
				166	"fmla v5.4s, v12.4s, v0.s[1]\n"
				167	"scvtf v17.4s, v17.4s\n"
				168	"fmul v4.4s, v4.4s, v0.s[0]\n"
				169	"fmla v4.4s, v17.4s, v0.s[1]\n"
				170	"scvtf v10.4s, v10.4s\n"
				171	"fmul v3.4s, v3.4s, v0.s[0]\n"
				172	"fmla v3.4s, v10.4s, v0.s[1]\n"
				173	"scvtf v16.4s, v16.4s\n"
				174	"fmul v2.4s, v2.4s, v0.s[0]\n"
				175	"fmla v2.4s, v16.4s, v0.s[1]\n"
				176	"scvtf v9.4s, v9.4s\n"
				177	"fmul v1.4s, v1.4s, v0.s[0]\n"
				178	"fmla v1.4s, v9.4s, v0.s[1]\n"
					// Optional direct output: scale by 1 / out_direct_scale, round (fcvtas), keep the low 16 bits of each
					// lane (uzp1), add the zero point, clamp to [-128, 127] (movi 0x7f / mvni 0x7f), narrow to 8 bits, store.
				179	"cbz %x[out_direct], 3f\n"
				180	"fmul v23.4s, v8.4s, v0.s[3]\n"
				181	"fmul v22.4s, v7.4s, v0.s[3]\n"
				182	"ldr w20, [%x[args_ptr], %[offsetof_out_direct_zeropt]]\n"
				183	"fmul v21.4s, v6.4s, v0.s[3]\n"
				184	"fmul v20.4s, v5.4s, v0.s[3]\n"
				185	"fmul v17.4s, v4.4s, v0.s[3]\n"
				186	"fmul v19.4s, v3.4s, v0.s[3]\n"
				187	"fmul v16.4s, v2.4s, v0.s[3]\n"
				188	"fmul v18.4s, v1.4s, v0.s[3]\n"
				189	"fcvtas v23.4s, v23.4s\n"
				190	"fcvtas v22.4s, v22.4s\n"
				191	"fcvtas v21.4s, v21.4s\n"
				192	"fcvtas v20.4s, v20.4s\n"
				193	"fcvtas v17.4s, v17.4s\n"
				194	"fcvtas v19.4s, v19.4s\n"
				195	"fcvtas v16.4s, v16.4s\n"
				196	"fcvtas v18.4s, v18.4s\n"
				197	"uzp1 v22.8h, v23.8h, v22.8h\n"
				198	"uzp1 v20.8h, v21.8h, v20.8h\n"
				199	"uzp1 v19.8h, v17.8h, v19.8h\n"
				200	"uzp1 v18.8h, v16.8h, v18.8h\n"
				201	"dup v16.8h, w20\n"
				202	"add v22.8h, v22.8h, v16.8h\n"
				203	"add v20.8h, v20.8h, v16.8h\n"
				204	"add v19.8h, v19.8h, v16.8h\n"
				205	"add v18.8h, v18.8h, v16.8h\n"
				206	"movi v17.8h, #0x7f\n"
				207	"mvni v16.8h, #0x7f\n"
				208	"smin v22.8h, v22.8h, v17.8h\n"
				209	"smin v20.8h, v20.8h, v17.8h\n"
				210	"smin v19.8h, v19.8h, v17.8h\n"
				211	"smin v18.8h, v18.8h, v17.8h\n"
				212	"smax v22.8h, v22.8h, v16.8h\n"
				213	"smax v20.8h, v20.8h, v16.8h\n"
				214	"smax v19.8h, v19.8h, v16.8h\n"
				215	"smax v18.8h, v18.8h, v16.8h\n"
				216	"xtn v22.8b, v22.8h\n"
				217	"str d22, [x25, #0x0]\n"
				218	"xtn v20.8b, v20.8h\n"
				219	"xtn v19.8b, v19.8h\n"
				220	"str d20, [x25, #0x8]\n"
				221	"xtn v18.8b, v18.8h\n"
				222	"str d19, [x22, #0x0]\n"
				223	"str d18, [x22, #0x8]\n"
					// Final output: bn_add + sum * bn_mul (fmla into a copy of bn_add), scale by 1 / out_scale, round,
					// pack to 16 bits, add out_zeropt, clamp with smin(maxval) then smax(minval), narrow and store.
					// "subs x23, x23, #2" sets the flags consumed by "bgt 2b" at the end of the row loop.
				224	"3:" // Main loop: No direct output
				225	"mov v19.16b, v28.16b\n"
				226	"mov v13.16b, v29.16b\n"
				227	"fmla v19.4s, v8.4s, v24.4s\n"
				228	"ldr w22, [%x[args_ptr], %[offsetof_out_zeropt]]\n"
				229	"mov v18.16b, v30.16b\n"
				230	"mov v12.16b, v31.16b\n"
				231	"fmla v13.4s, v7.4s, v25.4s\n"
				232	"ldr w21, [%x[args_ptr], %[offsetof_maxval]]\n"
				233	"mov v17.16b, v28.16b\n"
				234	"mov v10.16b, v29.16b\n"
				235	"fmla v18.4s, v6.4s, v26.4s\n"
				236	"ldr w20, [%x[args_ptr], %[offsetof_minval]]\n"
				237	"mov v16.16b, v30.16b\n"
				238	"mov v9.16b, v31.16b\n"
				239	"fmla v12.4s, v5.4s, v27.4s\n"
				240	"subs x23, x23, #0x2\n"
				241	"fmla v17.4s, v4.4s, v24.4s\n"
				242	"fmla v10.4s, v3.4s, v25.4s\n"
				243	"fmul v8.4s, v19.4s, v0.s[2]\n"
				244	"fmla v16.4s, v2.4s, v26.4s\n"
				245	"fmla v9.4s, v1.4s, v27.4s\n"
				246	"fmul v7.4s, v13.4s, v0.s[2]\n"
				247	"fmul v6.4s, v18.4s, v0.s[2]\n"
				248	"fmul v5.4s, v12.4s, v0.s[2]\n"
				249	"fmul v4.4s, v17.4s, v0.s[2]\n"
				250	"fmul v3.4s, v10.4s, v0.s[2]\n"
				251	"fmul v2.4s, v16.4s, v0.s[2]\n"
				252	"fmul v1.4s, v9.4s, v0.s[2]\n"
				253	"fcvtas v8.4s, v8.4s\n"
				254	"fcvtas v7.4s, v7.4s\n"
				255	"fcvtas v6.4s, v6.4s\n"
				256	"fcvtas v5.4s, v5.4s\n"
				257	"fcvtas v4.4s, v4.4s\n"
				258	"fcvtas v3.4s, v3.4s\n"
				259	"fcvtas v2.4s, v2.4s\n"
				260	"fcvtas v1.4s, v1.4s\n"
				261	"uzp1 v7.8h, v8.8h, v7.8h\n"
				262	"uzp1 v5.8h, v6.8h, v5.8h\n"
				263	"uzp1 v3.8h, v4.8h, v3.8h\n"
				264	"uzp1 v1.8h, v2.8h, v1.8h\n"
				265	"dup v16.8h, w22\n"
				266	"add v7.8h, v7.8h, v16.8h\n"
				267	"add v5.8h, v5.8h, v16.8h\n"
				268	"add v3.8h, v3.8h, v16.8h\n"
				269	"add v1.8h, v1.8h, v16.8h\n"
				270	"dup v16.8h, w21\n"
				271	"smin v7.8h, v7.8h, v16.8h\n"
				272	"smin v5.8h, v5.8h, v16.8h\n"
				273	"smin v3.8h, v3.8h, v16.8h\n"
				274	"smin v1.8h, v1.8h, v16.8h\n"
				275	"dup v16.8h, w20\n"
				276	"smax v7.8h, v7.8h, v16.8h\n"
				277	"smax v5.8h, v5.8h, v16.8h\n"
				278	"smax v3.8h, v3.8h, v16.8h\n"
				279	"smax v1.8h, v1.8h, v16.8h\n"
				280	"xtn v7.8b, v7.8h\n"
				281	"str d7, [x26, #0x0]\n"
				282	"xtn v5.8b, v5.8h\n"
				283	"xtn v3.8b, v3.8h\n"
				284	"str d5, [x26, #0x8]\n"
				285	"xtn v1.8b, v1.8h\n"
				286	"str d3, [x24, #0x0]\n"
				287	"str d1, [x24, #0x8]\n"
				288	"bgt 2b\n"
					// All rows of this block done: step the base pointers 16 columns to the right (out_direct only
					// when it is non-null), reduce width by 16 and loop while at least 16 columns remain.
				289	"add %x[in0], %x[in0], #0x10\n"
				290	"add %x[in1], %x[in1], #0x10\n"
				291	"add %x[out], %x[out], #0x10\n"
				292	"cbz %x[out_direct], 4f\n"
				293	"add %x[out_direct], %x[out_direct], #0x10\n"
				294	"4:" // No direct pointer update
				295	"sub %x[width], %x[width], #0x10\n"
				296	"cmp %x[width], #0x10\n"
				297	"bge 1b\n"
				298	"cbz %x[width], 32f\n"
					// Tail: the remaining columns (fewer than 16). bn_mul/bn_add are still loaded as 16 floats each.
				299	"5:" // main loop skip
				300	"ldr q24, [%x[bn_mul], #0x0]\n"
				301	"ldr q25, [%x[bn_mul], #0x10]\n"
				302	"mov x23, %x[height]\n"
				303	"mov x12, %x[in0]\n"
				304	"ldr q26, [%x[bn_mul], #0x20]\n"
				305	"ldr q27, [%x[bn_mul], #0x30]\n"
				306	"mov x11, %x[in1]\n"
				307	"mov x10, %x[out]\n"
				308	"ldr q28, [%x[bn_add], #0x0]\n"
				309	"ldr q29, [%x[bn_add], #0x10]\n"
				310	"mov x9, %x[out_direct]\n"
				311	"add %x[bn_mul], %x[bn_mul], #0x40\n"
				312	"ldr q30, [%x[bn_add], #0x20]\n"
				313	"ldr q31, [%x[bn_add], #0x30]\n"
				314	"add %x[bn_add], %x[bn_add], #0x40\n"
					// Same two-row scheme as the main loop, including the csel fallback for a single remaining row.
					// Loads are split by testing bits 3..0 of width (8, 4, 2, 1 elements), so only the remaining
					// columns are read from memory.
				315	"6:" // tail loop: Row loop
				316	"mov x28, x12\n"
				317	"mov x27, x11\n"
				318	"mov x26, x10\n"
				319	"mov x25, x9\n"
				320	"add x21, x28, %x[in0_stride]\n"
				321	"add x20, x27, %x[in1_stride]\n"
				322	"add x24, x26, %x[out_stride]\n"
				323	"add x22, x25, %x[out_direct_stride]\n"
				324	"cmp x23, #0x2\n"
				325	"add x12, x21, %x[in0_stride]\n"
				326	"add x11, x20, %x[in1_stride]\n"
				327	"add x10, x24, %x[out_stride]\n"
				328	"add x9, x22, %x[out_direct_stride]\n"
				329	"csel x21, x21, x28, GE\n"
				330	"csel x20, x20, x27, GE\n"
				331	"csel x24, x24, x26, GE\n"
				332	"csel x22, x22, x25, GE\n"
				333	"tbz %x[width], #3, 10f\n"
				334	"ldr d4, [x28, #0x0]\n"
				335	"ldr d13, [x27, #0x0]\n"
				336	"add x28, x28, #0x8\n"
				337	"add x27, x27, #0x8\n"
				338	"ldr d2, [x21, #0x0]\n"
				339	"ldr d10, [x20, #0x0]\n"
				340	"add x21, x21, #0x8\n"
				341	"add x20, x20, #0x8\n"
				342	"tbz %x[width], #2, 8f\n"
				343	"ldr s3, [x28], #0x4\n"
				344	"ldr s12, [x27], #0x4\n"
				345	"ldr s11, [x21], #0x4\n"
				346	"ldr s9, [x20], #0x4\n"
				347	"tbz %x[width], #1, 7f\n"
				348	"ld1 { v3.h }[2], [x28], #0x2\n"
				349	"ld1 { v12.h }[2], [x27], #0x2\n"
				350	"ld1 { v11.h }[2], [x21], #0x2\n"
				351	"ld1 { v9.h }[2], [x20], #0x2\n"
				352	"tbz %x[width], #0, 14f\n"
				353	"ld1 { v3.b }[6], [x28], #0x1\n"
				354	"ld1 { v12.b }[6], [x27], #0x1\n"
				355	"ld1 { v11.b }[6], [x21], #0x1\n"
				356	"ld1 { v9.b }[6], [x20], #0x1\n"
				357	"b 14f\n"
				358	"7:" // tail loop: unique 1: partial_0_12
				359	"tbz %x[width], #0, 14f\n"
				360	"ld1 { v3.b }[4], [x28], #0x1\n"
				361	"ld1 { v12.b }[4], [x27], #0x1\n"
				362	"ld1 { v11.b }[4], [x21], #0x1\n"
				363	"ld1 { v9.b }[4], [x20], #0x1\n"
				364	"b 14f\n"
				365	"8:" // tail loop: unique 1: partial_1_8
				366	"tbz %x[width], #1, 9f\n"
				367	"ldr h3, [x28], #0x2\n"
				368	"ldr h12, [x27], #0x2\n"
				369	"ldr h11, [x21], #0x2\n"
				370	"ldr h9, [x20], #0x2\n"
				371	"tbz %x[width], #0, 14f\n"
				372	"ld1 { v3.b }[2], [x28], #0x1\n"
				373	"ld1 { v12.b }[2], [x27], #0x1\n"
				374	"ld1 { v11.b }[2], [x21], #0x1\n"
				375	"ld1 { v9.b }[2], [x20], #0x1\n"
				376	"b 14f\n"
				377	"9:" // tail loop: unique 1: partial_0_8
				378	"tbz %x[width], #0, 14f\n"
				379	"ldr b3, [x28], #0x1\n"
				380	"ldr b12, [x27], #0x1\n"
				381	"ldr b11, [x21], #0x1\n"
				382	"ldr b9, [x20], #0x1\n"
				383	"b 14f\n"
				384	"10:" // tail loop: unique 1: partial_2_0
				385	"tbz %x[width], #2, 12f\n"
				386	"ldr s4, [x28], #0x4\n"
				387	"ldr s13, [x27], #0x4\n"
				388	"ldr s2, [x21], #0x4\n"
				389	"ldr s10, [x20], #0x4\n"
				390	"tbz %x[width], #1, 11f\n"
				391	"ld1 { v4.h }[2], [x28], #0x2\n"
				392	"ld1 { v13.h }[2], [x27], #0x2\n"
				393	"ld1 { v2.h }[2], [x21], #0x2\n"
				394	"ld1 { v10.h }[2], [x20], #0x2\n"
				395	"tbz %x[width], #0, 14f\n"
				396	"ld1 { v4.b }[6], [x28], #0x1\n"
				397	"ld1 { v13.b }[6], [x27], #0x1\n"
				398	"ld1 { v2.b }[6], [x21], #0x1\n"
				399	"ld1 { v10.b }[6], [x20], #0x1\n"
				400	"b 14f\n"
				401	"11:" // tail loop: unique 1: partial_0_4
				402	"tbz %x[width], #0, 14f\n"
				403	"ld1 { v4.b }[4], [x28], #0x1\n"
				404	"ld1 { v13.b }[4], [x27], #0x1\n"
				405	"ld1 { v2.b }[4], [x21], #0x1\n"
				406	"ld1 { v10.b }[4], [x20], #0x1\n"
				407	"b 14f\n"
				408	"12:" // tail loop: unique 1: partial_1_0
				409	"tbz %x[width], #1, 13f\n"
				410	"ldr h4, [x28], #0x2\n"
				411	"ldr h13, [x27], #0x2\n"
				412	"ldr h2, [x21], #0x2\n"
				413	"ldr h10, [x20], #0x2\n"
				414	"tbz %x[width], #0, 14f\n"
				415	"ld1 { v4.b }[2], [x28], #0x1\n"
				416	"ld1 { v13.b }[2], [x27], #0x1\n"
				417	"ld1 { v2.b }[2], [x21], #0x1\n"
				418	"ld1 { v10.b }[2], [x20], #0x1\n"
				419	"b 14f\n"
					// Reached when bits 3, 2 and 1 of width are all clear: one element is loaded without testing bit 0.
				420	"13:" // tail loop: unique 1: partial_0_0
				421	"ldr b4, [x28], #0x1\n"
				422	"ldr b13, [x27], #0x1\n"
				423	"ldr b2, [x21], #0x1\n"
				424	"ldr b10, [x20], #0x1\n"
					// The arithmetic below repeats the main-loop sequence on all 16 lanes; lanes that were not
					// loaded are computed as well, but only the remaining columns are stored afterwards.
				425	"14:" // tail loop: unique 1: Done
				426	"ldr w21, [%x[args_ptr], %[offsetof_in0_zeropt]]\n"
				427	"sshll v4.8h, v4.8b, #0x0\n"
				428	"sshll v3.8h, v3.8b, #0x0\n"
				429	"ldr w20, [%x[args_ptr], %[offsetof_in1_zeropt]]\n"
				430	"sshll v2.8h, v2.8b, #0x0\n"
				431	"sshll v11.8h, v11.8b, #0x0\n"
				432	"dup v16.8h, w21\n"
				433	"sshll v13.8h, v13.8b, #0x0\n"
				434	"sshll v12.8h, v12.8b, #0x0\n"
				435	"sshll v10.8h, v10.8b, #0x0\n"
				436	"sshll v9.8h, v9.8b, #0x0\n"
				437	"ssubl v1.4s, v4.4h, v16.4h\n"
				438	"ssubl2 v4.4s, v4.8h, v16.8h\n"
				439	"ssubl v23.4s, v3.4h, v16.4h\n"
				440	"ssubl2 v3.4s, v3.8h, v16.8h\n"
				441	"ssubl v22.4s, v2.4h, v16.4h\n"
				442	"ssubl2 v2.4s, v2.8h, v16.8h\n"
				443	"ssubl v21.4s, v11.4h, v16.4h\n"
				444	"ssubl2 v11.4s, v11.8h, v16.8h\n"
				445	"dup v20.8h, w20\n"
				446	"ssubl v19.4s, v13.4h, v20.4h\n"
				447	"ssubl2 v13.4s, v13.8h, v20.8h\n"
				448	"ssubl v18.4s, v12.4h, v20.4h\n"
				449	"ssubl2 v12.4s, v12.8h, v20.8h\n"
				450	"ssubl v17.4s, v10.4h, v20.4h\n"
				451	"ssubl2 v10.4s, v10.8h, v20.8h\n"
				452	"ssubl v16.4s, v9.4h, v20.4h\n"
				453	"ssubl2 v9.4s, v9.8h, v20.8h\n"
				454	"scvtf v8.4s, v1.4s\n"
				455	"scvtf v7.4s, v4.4s\n"
				456	"scvtf v6.4s, v23.4s\n"
				457	"scvtf v5.4s, v3.4s\n"
				458	"scvtf v4.4s, v22.4s\n"
				459	"scvtf v3.4s, v2.4s\n"
				460	"scvtf v2.4s, v21.4s\n"
				461	"scvtf v1.4s, v11.4s\n"
				462	"scvtf v19.4s, v19.4s\n"
				463	"fmul v8.4s, v8.4s, v0.s[0]\n"
				464	"fmla v8.4s, v19.4s, v0.s[1]\n"
				465	"scvtf v13.4s, v13.4s\n"
				466	"fmul v7.4s, v7.4s, v0.s[0]\n"
				467	"fmla v7.4s, v13.4s, v0.s[1]\n"
				468	"scvtf v18.4s, v18.4s\n"
				469	"fmul v6.4s, v6.4s, v0.s[0]\n"
				470	"fmla v6.4s, v18.4s, v0.s[1]\n"
				471	"scvtf v12.4s, v12.4s\n"
				472	"fmul v5.4s, v5.4s, v0.s[0]\n"
				473	"fmla v5.4s, v12.4s, v0.s[1]\n"
				474	"scvtf v17.4s, v17.4s\n"
				475	"fmul v4.4s, v4.4s, v0.s[0]\n"
				476	"fmla v4.4s, v17.4s, v0.s[1]\n"
				477	"scvtf v10.4s, v10.4s\n"
				478	"fmul v3.4s, v3.4s, v0.s[0]\n"
				479	"fmla v3.4s, v10.4s, v0.s[1]\n"
				480	"scvtf v16.4s, v16.4s\n"
				481	"fmul v2.4s, v2.4s, v0.s[0]\n"
				482	"fmla v2.4s, v16.4s, v0.s[1]\n"
				483	"scvtf v9.4s, v9.4s\n"
				484	"fmul v1.4s, v1.4s, v0.s[0]\n"
				485	"fmla v1.4s, v9.4s, v0.s[1]\n"
					// Optional direct output for the tail; skipped entirely when out_direct is null.
				486	"cbz %x[out_direct], 23f\n"
				487	"fmul v23.4s, v8.4s, v0.s[3]\n"
				488	"fmul v22.4s, v7.4s, v0.s[3]\n"
				489	"ldr w20, [%x[args_ptr], %[offsetof_out_direct_zeropt]]\n"
				490	"fmul v21.4s, v6.4s, v0.s[3]\n"
				491	"fmul v20.4s, v5.4s, v0.s[3]\n"
				492	"fmul v17.4s, v4.4s, v0.s[3]\n"
				493	"fmul v19.4s, v3.4s, v0.s[3]\n"
				494	"fmul v16.4s, v2.4s, v0.s[3]\n"
				495	"fmul v18.4s, v1.4s, v0.s[3]\n"
				496	"fcvtas v23.4s, v23.4s\n"
				497	"fcvtas v22.4s, v22.4s\n"
				498	"fcvtas v21.4s, v21.4s\n"
				499	"fcvtas v20.4s, v20.4s\n"
				500	"fcvtas v17.4s, v17.4s\n"
				501	"fcvtas v19.4s, v19.4s\n"
				502	"fcvtas v16.4s, v16.4s\n"
				503	"fcvtas v18.4s, v18.4s\n"
				504	"uzp1 v22.8h, v23.8h, v22.8h\n"
				505	"uzp1 v20.8h, v21.8h, v20.8h\n"
				506	"uzp1 v19.8h, v17.8h, v19.8h\n"
				507	"uzp1 v18.8h, v16.8h, v18.8h\n"
				508	"dup v16.8h, w20\n"
				509	"add v22.8h, v22.8h, v16.8h\n"
				510	"add v20.8h, v20.8h, v16.8h\n"
				511	"add v19.8h, v19.8h, v16.8h\n"
				512	"add v18.8h, v18.8h, v16.8h\n"
				513	"movi v17.8h, #0x7f\n"
				514	"mvni v16.8h, #0x7f\n"
				515	"smin v22.8h, v22.8h, v17.8h\n"
				516	"smin v20.8h, v20.8h, v17.8h\n"
				517	"smin v19.8h, v19.8h, v17.8h\n"
				518	"smin v18.8h, v18.8h, v17.8h\n"
				519	"smax v22.8h, v22.8h, v16.8h\n"
				520	"smax v20.8h, v20.8h, v16.8h\n"
				521	"smax v19.8h, v19.8h, v16.8h\n"
				522	"smax v18.8h, v18.8h, v16.8h\n"
				523	"xtn v22.8b, v22.8h\n"
				524	"xtn v20.8b, v20.8h\n"
				525	"xtn v19.8b, v19.8h\n"
				526	"xtn v18.8b, v18.8h\n"
					// Partial stores of the direct output, split on bits 3..0 of width like the loads above.
				527	"tbz %x[width], #3, 18f\n"
				528	"str d22, [x25, #0x0]\n"
				529	"add x25, x25, #0x8\n"
				530	"str d19, [x22, #0x0]\n"
				531	"add x22, x22, #0x8\n"
				532	"tbz %x[width], #2, 16f\n"
				533	"str s20, [x25], #0x4\n"
				534	"str s18, [x22], #0x4\n"
				535	"tbz %x[width], #1, 15f\n"
				536	"st1 { v20.h }[2], [x25], #0x2\n"
				537	"st1 { v18.h }[2], [x22], #0x2\n"
				538	"tbz %x[width], #0, 22f\n"
				539	"st1 { v20.b }[6], [x25], #0x1\n"
				540	"st1 { v18.b }[6], [x22], #0x1\n"
				541	"b 22f\n"
				542	"15:" // tail loop: Main loop: unique 2: partial_0_12
				543	"tbz %x[width], #0, 22f\n"
				544	"st1 { v20.b }[4], [x25], #0x1\n"
				545	"st1 { v18.b }[4], [x22], #0x1\n"
				546	"b 22f\n"
				547	"16:" // tail loop: Main loop: unique 2: partial_1_8
				548	"tbz %x[width], #1, 17f\n"
				549	"str h20, [x25], #0x2\n"
				550	"str h18, [x22], #0x2\n"
				551	"tbz %x[width], #0, 22f\n"
				552	"st1 { v20.b }[2], [x25], #0x1\n"
				553	"st1 { v18.b }[2], [x22], #0x1\n"
				554	"b 22f\n"
				555	"17:" // tail loop: Main loop: unique 2: partial_0_8
				556	"tbz %x[width], #0, 22f\n"
				557	"str b20, [x25], #0x1\n"
				558	"str b18, [x22], #0x1\n"
				559	"b 22f\n"
				560	"18:" // tail loop: Main loop: unique 2: partial_2_0
				561	"tbz %x[width], #2, 20f\n"
				562	"str s22, [x25], #0x4\n"
				563	"str s19, [x22], #0x4\n"
				564	"tbz %x[width], #1, 19f\n"
				565	"st1 { v22.h }[2], [x25], #0x2\n"
				566	"st1 { v19.h }[2], [x22], #0x2\n"
				567	"tbz %x[width], #0, 22f\n"
				568	"st1 { v22.b }[6], [x25], #0x1\n"
				569	"st1 { v19.b }[6], [x22], #0x1\n"
				570	"b 22f\n"
				571	"19:" // tail loop: Main loop: unique 2: partial_0_4
				572	"tbz %x[width], #0, 22f\n"
				573	"st1 { v22.b }[4], [x25], #0x1\n"
				574	"st1 { v19.b }[4], [x22], #0x1\n"
				575	"b 22f\n"
				576	"20:" // tail loop: Main loop: unique 2: partial_1_0
				577	"tbz %x[width], #1, 21f\n"
				578	"str h22, [x25], #0x2\n"
				579	"str h19, [x22], #0x2\n"
				580	"tbz %x[width], #0, 22f\n"
				581	"st1 { v22.b }[2], [x25], #0x1\n"
				582	"st1 { v19.b }[2], [x22], #0x1\n"
				583	"b 22f\n"
				584	"21:" // tail loop: Main loop: unique 2: partial_0_0
				585	"str b22, [x25], #0x1\n"
				586	"str b19, [x22], #0x1\n"
				587	"22:" // tail loop: Main loop: unique 2: Done
					// Final output for the tail: same bn_add + sum * bn_mul, requantize and clamp sequence as label 3.
				588	"23:" // tail loop: Main loop: No direct output
				589	"mov v19.16b, v28.16b\n"
				590	"mov v13.16b, v29.16b\n"
				591	"fmla v19.4s, v8.4s, v24.4s\n"
				592	"ldr w22, [%x[args_ptr], %[offsetof_out_zeropt]]\n"
				593	"mov v18.16b, v30.16b\n"
				594	"mov v12.16b, v31.16b\n"
				595	"fmla v13.4s, v7.4s, v25.4s\n"
				596	"ldr w21, [%x[args_ptr], %[offsetof_maxval]]\n"
				597	"mov v17.16b, v28.16b\n"
				598	"mov v10.16b, v29.16b\n"
				599	"fmla v18.4s, v6.4s, v26.4s\n"
				600	"ldr w20, [%x[args_ptr], %[offsetof_minval]]\n"
				601	"mov v16.16b, v30.16b\n"
				602	"mov v9.16b, v31.16b\n"
				603	"fmla v12.4s, v5.4s, v27.4s\n"
				604	"fmla v17.4s, v4.4s, v24.4s\n"
				605	"fmla v10.4s, v3.4s, v25.4s\n"
				606	"fmul v8.4s, v19.4s, v0.s[2]\n"
				607	"fmla v16.4s, v2.4s, v26.4s\n"
				608	"fmla v9.4s, v1.4s, v27.4s\n"
				609	"fmul v7.4s, v13.4s, v0.s[2]\n"
				610	"fmul v6.4s, v18.4s, v0.s[2]\n"
				611	"fmul v5.4s, v12.4s, v0.s[2]\n"
				612	"fmul v4.4s, v17.4s, v0.s[2]\n"
				613	"fmul v3.4s, v10.4s, v0.s[2]\n"
				614	"fmul v2.4s, v16.4s, v0.s[2]\n"
				615	"fmul v1.4s, v9.4s, v0.s[2]\n"
				616	"fcvtas v8.4s, v8.4s\n"
				617	"fcvtas v7.4s, v7.4s\n"
				618	"fcvtas v6.4s, v6.4s\n"
				619	"fcvtas v5.4s, v5.4s\n"
				620	"fcvtas v4.4s, v4.4s\n"
				621	"fcvtas v3.4s, v3.4s\n"
				622	"fcvtas v2.4s, v2.4s\n"
				623	"fcvtas v1.4s, v1.4s\n"
				624	"uzp1 v7.8h, v8.8h, v7.8h\n"
				625	"uzp1 v5.8h, v6.8h, v5.8h\n"
				626	"uzp1 v3.8h, v4.8h, v3.8h\n"
				627	"uzp1 v1.8h, v2.8h, v1.8h\n"
				628	"dup v16.8h, w22\n"
				629	"add v7.8h, v7.8h, v16.8h\n"
				630	"add v5.8h, v5.8h, v16.8h\n"
				631	"add v3.8h, v3.8h, v16.8h\n"
				632	"add v1.8h, v1.8h, v16.8h\n"
				633	"dup v16.8h, w21\n"
				634	"smin v7.8h, v7.8h, v16.8h\n"
				635	"smin v5.8h, v5.8h, v16.8h\n"
				636	"smin v3.8h, v3.8h, v16.8h\n"
				637	"smin v1.8h, v1.8h, v16.8h\n"
				638	"dup v16.8h, w20\n"
				639	"smax v7.8h, v7.8h, v16.8h\n"
				640	"smax v5.8h, v5.8h, v16.8h\n"
				641	"smax v3.8h, v3.8h, v16.8h\n"
				642	"smax v1.8h, v1.8h, v16.8h\n"
				643	"xtn v7.8b, v7.8h\n"
				644	"xtn v5.8b, v5.8h\n"
				645	"xtn v3.8b, v3.8h\n"
				646	"xtn v1.8b, v1.8h\n"
					// Partial stores of the final output, split on bits 3..0 of width.
				647	"tbz %x[width], #3, 27f\n"
				648	"str d7, [x26, #0x0]\n"
				649	"add x26, x26, #0x8\n"
				650	"str d3, [x24, #0x0]\n"
				651	"add x24, x24, #0x8\n"
				652	"tbz %x[width], #2, 25f\n"
				653	"str s5, [x26], #0x4\n"
				654	"str s1, [x24], #0x4\n"
				655	"tbz %x[width], #1, 24f\n"
				656	"st1 { v5.h }[2], [x26], #0x2\n"
				657	"st1 { v1.h }[2], [x24], #0x2\n"
				658	"tbz %x[width], #0, 31f\n"
				659	"st1 { v5.b }[6], [x26], #0x1\n"
				660	"st1 { v1.b }[6], [x24], #0x1\n"
				661	"b 31f\n"
				662	"24:" // tail loop: unique 3: partial_0_12
				663	"tbz %x[width], #0, 31f\n"
				664	"st1 { v5.b }[4], [x26], #0x1\n"
				665	"st1 { v1.b }[4], [x24], #0x1\n"
				666	"b 31f\n"
				667	"25:" // tail loop: unique 3: partial_1_8
				668	"tbz %x[width], #1, 26f\n"
				669	"str h5, [x26], #0x2\n"
				670	"str h1, [x24], #0x2\n"
				671	"tbz %x[width], #0, 31f\n"
				672	"st1 { v5.b }[2], [x26], #0x1\n"
				673	"st1 { v1.b }[2], [x24], #0x1\n"
				674	"b 31f\n"
				675	"26:" // tail loop: unique 3: partial_0_8
				676	"tbz %x[width], #0, 31f\n"
				677	"str b5, [x26], #0x1\n"
				678	"str b1, [x24], #0x1\n"
				679	"b 31f\n"
				680	"27:" // tail loop: unique 3: partial_2_0
				681	"tbz %x[width], #2, 29f\n"
				682	"str s7, [x26], #0x4\n"
				683	"str s3, [x24], #0x4\n"
				684	"tbz %x[width], #1, 28f\n"
				685	"st1 { v7.h }[2], [x26], #0x2\n"
				686	"st1 { v3.h }[2], [x24], #0x2\n"
				687	"tbz %x[width], #0, 31f\n"
				688	"st1 { v7.b }[6], [x26], #0x1\n"
				689	"st1 { v3.b }[6], [x24], #0x1\n"
				690	"b 31f\n"
				691	"28:" // tail loop: unique 3: partial_0_4
				692	"tbz %x[width], #0, 31f\n"
				693	"st1 { v7.b }[4], [x26], #0x1\n"
				694	"st1 { v3.b }[4], [x24], #0x1\n"
				695	"b 31f\n"
				696	"29:" // tail loop: unique 3: partial_1_0
				697	"tbz %x[width], #1, 30f\n"
				698	"str h7, [x26], #0x2\n"
				699	"str h3, [x24], #0x2\n"
				700	"tbz %x[width], #0, 31f\n"
				701	"st1 { v7.b }[2], [x26], #0x1\n"
				702	"st1 { v3.b }[2], [x24], #0x1\n"
				703	"b 31f\n"
				704	"30:" // tail loop: unique 3: partial_0_0
				705	"str b7, [x26], #0x1\n"
				706	"str b3, [x24], #0x1\n"
				707	"31:" // tail loop: unique 3: Done
				708	"subs x23, x23, #0x2\n"
				709	"bgt 6b\n"
				710	"32:" // odd columns skip
					// Read-write operands ("+&r"): the assembly advances these pointers and decrements width in place.
				711	: [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width)
					// Read-only operands: register inputs plus the KernelArgs field offsets passed as immediates ("I").
				712	: [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)), [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)), [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)), [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride)
					// Clobbers: condition flags, memory, and every vector / general-purpose register named in the assembly.
				713	: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
				714	}
				715
				716	} // namespace
				717
				718	namespace arm_compute
				719	{
				720	namespace cpu
				721	{
					/** Run the fused add / multiply / add kernel on signed 8-bit quantized tensors.
					 *
					 * Gathers strides, quantization parameters and activation clamp bounds from the tensors and
					 * forwards them to a64_add_bn_clamp_direct_s8_fp32_2x16 once per step of the execution window
					 * with its X and Y dimensions collapsed.
					 *
					 * @param[in]  input1       First input tensor; elements are read as int8_t.
					 * @param[in]  input2       Second input tensor; elements are read as int8_t.
					 * @param[in]  bn_mul       Tensor whose buffer is read as an array of float multipliers.
					 * @param[in]  bn_add       Tensor whose buffer is read as an array of float addends.
					 * @param[out] add_output   Optional tensor receiving the requantized sum of the inputs; may be nullptr.
					 * @param[out] final_output Tensor receiving the final result.
					 * @param[in]  policy       Unused.
					 * @param[in]  act_info     Activation; RELU, BOUNDED_RELU and LU_BOUNDED_RELU narrow the output clamp
					 *                          range, any other value leaves the full int8_t range.
					 * @param[in]  window       Execution window; its X and Y iteration counts become the width and height
					 *                          handed to the kernel.
					 */
				722	void add_mul_add_s8_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add,
				723	ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window)
				724	{
				725	ARM_COMPUTE_UNUSED(policy);
				726
				727	const ITensorInfo *final_output_info = final_output->info();
				728	const ITensorInfo *add_output_info = (add_output != nullptr) ? add_output->info() : nullptr;
				729	const ITensorInfo *input1_info = input1->info();
				730	const ITensorInfo *input2_info = input2->info();
				731
					// Row strides (dimension 1), in bytes; the add output stride is 0 when that output is absent.
				732	const size_t out_stride = final_output_info->strides_in_bytes()[1];
				733	const size_t out_direct_stride = (add_output != nullptr) ? add_output_info->strides_in_bytes()[1] : 0;
				734	const size_t in0_stride = input1_info->strides_in_bytes()[1];
				735	const size_t in1_stride = input2_info->strides_in_bytes()[1];
				736
					// Default clamp range is the whole int8_t range; supported activations tighten it below.
				737	int8_t minval = std::numeric_limits<int8_t>::lowest();
				738	int8_t maxval = std::numeric_limits<int8_t>::max();
				739
				740	const UniformQuantizationInfo final_output_qinfo = final_output_info->quantization_info().uniform();
				741	if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
				742	{
				743	minval = quantize_qasymm8_signed(0.f, final_output_qinfo);
				744	}
				745	else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
				746	{
				747	minval = quantize_qasymm8_signed(0.f, final_output_qinfo);
				748	maxval = quantize_qasymm8_signed(act_info.a(), final_output_qinfo);
				749	}
				750	else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
				751	{
				752	minval = quantize_qasymm8_signed(act_info.b(), final_output_qinfo);
				753	maxval = quantize_qasymm8_signed(act_info.a(), final_output_qinfo);
				754	}
				755
				756	const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform();
				757	const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform();
				758	const UniformQuantizationInfo add_output_qinfo = (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo();
				759
				760	const int32_t in1_offset = in1_qinfo.offset;
				761	const int32_t in2_offset = in2_qinfo.offset;
				762	const int32_t out_offset = final_output_qinfo.offset;
				763	const int32_t out_direct_offset = add_output_qinfo.offset;
				764
				765	const float in1_scale = in1_qinfo.scale;
				766	const float in2_scale = in2_qinfo.scale;
				767	const float out_scale = final_output_qinfo.scale;
				768	const float out_direct_scale = add_output_qinfo.scale;
				769
					// The multiplier / addend tensors are consumed as raw float arrays starting at their buffer base.
				770	const float *bn_mul_buffer = reinterpret_cast<float *>(bn_mul->buffer());
				771	const float *bn_add_buffer = reinterpret_cast<float *>(bn_add->buffer());
				772
				773	// Clear X & Y dimensions on execution window as we handle manually
				774	Window win = window;
				775	win.set(Window::DimX, Window::Dimension(0, 1, 1));
				776	win.set(Window::DimY, Window::Dimension(0, 1, 1));
				777
					// NOTE(review): the iterators are built from the full `window` while the loop below runs over the
					// collapsed `win` - presumably so their start offsets keep the X/Y start of `window`; confirm.
				778	Iterator in1_it(input1, window);
				779	Iterator in2_it(input2, window);
				780	Iterator out_it(final_output, window);
				781
				782	const size_t width = window.num_iterations(0);
				783	const size_t height = window.num_iterations(1);
				784
				785	if(add_output != nullptr)
				786	{
				787	Iterator add_out_it(add_output, window);
				788	execute_window_loop(
				789	win, [&](const Coordinates &)
				790	{
				791	a64_add_bn_clamp_direct_s8_fp32_2x16(
				792	reinterpret_cast<int8_t *>(out_it.ptr()), out_stride,
				793	reinterpret_cast<int8_t *>(add_out_it.ptr()), out_direct_stride,
				794	reinterpret_cast<int8_t *>(in1_it.ptr()), in0_stride,
				795	reinterpret_cast<int8_t *>(in2_it.ptr()), in1_stride,
				796	bn_mul_buffer,
				797	bn_add_buffer,
				798	minval,
				799	maxval,
				800	out_offset, out_scale,
				801	out_direct_offset, out_direct_scale,
				802	in1_offset, in1_scale,
				803	in2_offset, in2_scale,
				804	width, height);
				805	},
				806	in1_it, in2_it, add_out_it, out_it);
				807	}
				808	else
				809	{
					// No intermediate add output requested: pass nullptr so the kernel skips that store.
				810	execute_window_loop(
				811	win, [&](const Coordinates &)
				812	{
				813	a64_add_bn_clamp_direct_s8_fp32_2x16(
				814	reinterpret_cast<int8_t *>(out_it.ptr()), out_stride,
				815	nullptr, out_direct_stride,
				816	reinterpret_cast<int8_t *>(in1_it.ptr()), in0_stride,
				817	reinterpret_cast<int8_t *>(in2_it.ptr()), in1_stride,
				818	bn_mul_buffer,
				819	bn_add_buffer,
				820	minval,
				821	maxval,
				822	out_offset, out_scale,
				823	out_direct_offset, out_direct_scale,
				824	in1_offset, in1_scale,
				825	in2_offset, in2_scale,
				826	width, height);
				827	},
				828	in1_it, in2_it, out_it);
				829	}
				830	}
				831	} // namespace cpu
				832	} // namespace arm_compute
				833
				834	#endif // __aarch64__