| /* |
| * Copyright (c) 2024 Arm Limited. |
| * |
| * SPDX-License-Identifier: MIT |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a copy |
| * of this software and associated documentation files (the "Software"), to |
| * deal in the Software without restriction, including without limitation the |
| * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
| * sell copies of the Software, and to permit persons to whom the Software is |
| * furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice shall be included in all |
| * copies or substantial portions of the Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| * SOFTWARE. |
| */ |
| #pragma once |
| #ifdef ARM_COMPUTE_ENABLE_SVE |
| |
| template<> |
| void MergeResults<3, 8, true>( |
| bfloat16 *out_ptr, |
| const float * in_ptr, |
| const int ldout, |
| const int y0, const int ymax, |
| const int x0, const int xmax, |
| const bfloat16 *bias, |
| Activation act, |
| bool accumulate) |
| { |
| float maxval = static_cast<float>(std::numeric_limits<float>::infinity()); |
| float minval = - static_cast<float>(std::numeric_limits<float>::infinity()); |
| |
| switch(act.type) { |
| default: |
| case Activation::Type::None: |
| break; |
| case Activation::Type::BoundedReLU: |
| maxval = static_cast<float>(act.param1); |
| /* fall through */ |
| case Activation::Type::ReLU: |
| minval = 0; |
| break; |
| } |
| |
| size_t rows = ymax-y0; |
| size_t cols = xmax-x0; |
| |
| out_ptr += (y0 * ldout) + x0; |
| bias = (bias == nullptr) ? nullptr : bias + x0; |
| |
| __asm__ __volatile__( |
| "ptrue p3.b\n" |
| "cbz %x[cols], 52f\n" |
| "cbz %x[rows], 52f\n" |
| "mov x12, #0x20\n" |
| "dup z12.s, %w[maxval]\n" |
| "dup z11.s, %w[minval]\n" |
| "mul x12, %x[ldout], x12\n" |
| "cbnz %x[accumulate], 34f\n" |
| "1:" // Initial: Row loop |
| "cmp %x[rows], #0x7\n" |
| "bgt 30f\n" |
| "beq 26f\n" |
| "cmp %x[rows], #0x5\n" |
| "bgt 22f\n" |
| "beq 18f\n" |
| "cmp %x[rows], #0x3\n" |
| "bgt 14f\n" |
| "beq 10f\n" |
| "cmp %x[rows], #0x1\n" |
| "bgt 6f\n" |
| "2:" // Initial: Height 1 |
| "mov x11, %x[cols]\n" |
| "mov x10, %x[out_ptr]\n" |
| "mov x9, %x[bias]\n" |
| "3:" // Initial: Height 1: Block loop |
| "mov x21, #0x0\n" |
| "addvl x20, %x[in_ptr], #16\n" |
| "whilelt p2.s, x21, x11\n" |
| "incw x21\n" |
| "whilelt p1.s, x21, x11\n" |
| "incw x21\n" |
| "whilelt p0.s, x21, x11\n" |
| "incw x21\n" |
| "cbnz %x[bias], 4f\n" |
| "mov z21.b, #0x0\n" |
| "mov z20.b, #0x0\n" |
| "mov z19.b, #0x0\n" |
| "b 5f\n" |
| "4:" // Initial: Height 1: Width 3: bias |
| "ld1h { z18.s }, p2/Z, [x9]\n" |
| "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n" |
| "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n" |
| "lsl z21.s, z18.s, #0x10\n" |
| "lsl z20.s, z17.s, #0x10\n" |
| "lsl z19.s, z16.s, #0x10\n" |
| "5:" // Initial: Height 1: Width 3: init done |
| "ld1w { z17.s }, p2/Z, [%x[in_ptr]]\n" |
| "ld1w { z16.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" |
| "decw x11, ALL, MUL #3\n" |
| "inch x9, ALL, MUL #3\n" |
| "ld1w { z18.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" |
| "addvl %x[in_ptr], %x[in_ptr], #24\n" |
| "fadd z17.s, z17.s, z21.s\n" |
| "fadd z16.s, z16.s, z20.s\n" |
| "cmp x11, XZR\n" |
| "fadd z18.s, z18.s, z19.s\n" |
| "fmin z17.s, p3/M, z17.s, z12.s\n" |
| "fmin z16.s, p3/M, z16.s, z12.s\n" |
| "fmin z18.s, p3/M, z18.s, z12.s\n" |
| "fmax z17.s, p3/M, z17.s, z11.s\n" |
| "fmax z16.s, p3/M, z16.s, z11.s\n" |
| "fmax z18.s, p3/M, z18.s, z11.s\n" |
| ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n" |
| ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n" |
| "st1h { z17.s }, p2, [x10]\n" |
| "st1h { z16.s }, p1, [x10, #1, MUL VL]\n" |
| ".inst 0x658aae50 // bfcvt z16.h, p3/M, z18.s\n" |
| "st1h { z16.s }, p0, [x10, #2, MUL VL]\n" |
| "inch x10, ALL, MUL #3\n" |
| "bgt 3b\n" |
| "b 52f\n" |
| "6:" // Initial: Height 2 |
| "mov x10, %x[out_ptr]\n" |
| "mov x11, %x[cols]\n" |
| "mov x9, %x[bias]\n" |
| "add x28, x10, %x[ldout], LSL #1\n" |
| "7:" // Initial: Height 2: Block loop |
| "mov x21, #0x0\n" |
| "addvl x20, %x[in_ptr], #16\n" |
| "whilelt p2.s, x21, x11\n" |
| "incw x21\n" |
| "whilelt p1.s, x21, x11\n" |
| "incw x21\n" |
| "whilelt p0.s, x21, x11\n" |
| "incw x21\n" |
| "cbnz %x[bias], 8f\n" |
| "mov z24.b, #0x0\n" |
| "mov z23.b, #0x0\n" |
| "mov z22.b, #0x0\n" |
| "b 9f\n" |
| "8:" // Initial: Height 2: Width 3: bias |
| "ld1h { z18.s }, p2/Z, [x9]\n" |
| "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n" |
| "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n" |
| "lsl z24.s, z18.s, #0x10\n" |
| "lsl z23.s, z17.s, #0x10\n" |
| "lsl z22.s, z16.s, #0x10\n" |
| "9:" // Initial: Height 2: Width 3: init done |
| "ld1w { z17.s }, p2/Z, [%x[in_ptr]]\n" |
| "ld1w { z16.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" |
| "decw x11, ALL, MUL #3\n" |
| "inch x9, ALL, MUL #3\n" |
| "ld1w { z19.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" |
| "ld1w { z18.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n" |
| "ld1w { z21.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n" |
| "ld1w { z20.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n" |
| "addvl %x[in_ptr], %x[in_ptr], #24\n" |
| "fadd z17.s, z17.s, z24.s\n" |
| "fadd z16.s, z16.s, z23.s\n" |
| "cmp x11, XZR\n" |
| "fadd z19.s, z19.s, z22.s\n" |
| "fadd z18.s, z18.s, z24.s\n" |
| "fadd z21.s, z21.s, z23.s\n" |
| "fadd z20.s, z20.s, z22.s\n" |
| "fmin z17.s, p3/M, z17.s, z12.s\n" |
| "fmin z16.s, p3/M, z16.s, z12.s\n" |
| "fmin z19.s, p3/M, z19.s, z12.s\n" |
| "fmin z18.s, p3/M, z18.s, z12.s\n" |
| "fmin z21.s, p3/M, z21.s, z12.s\n" |
| "fmin z20.s, p3/M, z20.s, z12.s\n" |
| "fmax z17.s, p3/M, z17.s, z11.s\n" |
| "fmax z16.s, p3/M, z16.s, z11.s\n" |
| "fmax z19.s, p3/M, z19.s, z11.s\n" |
| "fmax z18.s, p3/M, z18.s, z11.s\n" |
| "fmax z21.s, p3/M, z21.s, z11.s\n" |
| "fmax z20.s, p3/M, z20.s, z11.s\n" |
| ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n" |
| ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n" |
| ".inst 0x658aae73 // bfcvt z19.h, p3/M, z19.s\n" |
| ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n" |
| "st1h { z17.s }, p2, [x10]\n" |
| "st1h { z16.s }, p1, [x10, #1, MUL VL]\n" |
| ".inst 0x658aaeb1 // bfcvt z17.h, p3/M, z21.s\n" |
| ".inst 0x658aae90 // bfcvt z16.h, p3/M, z20.s\n" |
| "st1h { z19.s }, p0, [x10, #2, MUL VL]\n" |
| "inch x10, ALL, MUL #3\n" |
| "st1h { z18.s }, p2, [x28]\n" |
| "st1h { z17.s }, p1, [x28, #1, MUL VL]\n" |
| "st1h { z16.s }, p0, [x28, #2, MUL VL]\n" |
| "inch x28, ALL, MUL #3\n" |
| "bgt 7b\n" |
| "b 52f\n" |
| "10:" // Initial: Height 3 |
| "mov x10, %x[out_ptr]\n" |
| "mov x11, %x[cols]\n" |
| "mov x9, %x[bias]\n" |
| "add x28, x10, %x[ldout], LSL #1\n" |
| "add x27, x28, %x[ldout], LSL #1\n" |
| "11:" // Initial: Height 3: Block loop |
| "mov x21, #0x0\n" |
| "addvl x20, %x[in_ptr], #16\n" |
| "whilelt p2.s, x21, x11\n" |
| "incw x21\n" |
| "whilelt p1.s, x21, x11\n" |
| "incw x21\n" |
| "whilelt p0.s, x21, x11\n" |
| "incw x21\n" |
| "cbnz %x[bias], 12f\n" |
| "mov z27.b, #0x0\n" |
| "mov z26.b, #0x0\n" |
| "mov z25.b, #0x0\n" |
| "b 13f\n" |
| "12:" // Initial: Height 3: Width 3: bias |
| "ld1h { z18.s }, p2/Z, [x9]\n" |
| "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n" |
| "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n" |
| "lsl z27.s, z18.s, #0x10\n" |
| "lsl z26.s, z17.s, #0x10\n" |
| "lsl z25.s, z16.s, #0x10\n" |
| "13:" // Initial: Height 3: Width 3: init done |
| "ld1w { z18.s }, p2/Z, [%x[in_ptr]]\n" |
| "ld1w { z17.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" |
| "decw x11, ALL, MUL #3\n" |
| "inch x9, ALL, MUL #3\n" |
| "ld1w { z16.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" |
| "ld1w { z21.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n" |
| "ld1w { z20.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n" |
| "ld1w { z19.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n" |
| "ld1w { z24.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n" |
| "ld1w { z23.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n" |
| "fadd z18.s, z18.s, z27.s\n" |
| "fadd z17.s, z17.s, z26.s\n" |
| "ld1w { z22.s }, p0/Z, [x20, #-8, MUL VL]\n" |
| "fadd z16.s, z16.s, z25.s\n" |
| "fadd z21.s, z21.s, z27.s\n" |
| "cmp x11, XZR\n" |
| "fadd z20.s, z20.s, z26.s\n" |
| "fadd z19.s, z19.s, z25.s\n" |
| "addvl %x[in_ptr], %x[in_ptr], #24\n" |
| "fadd z24.s, z24.s, z27.s\n" |
| "fadd z23.s, z23.s, z26.s\n" |
| "fmin z18.s, p3/M, z18.s, z12.s\n" |
| "fmin z17.s, p3/M, z17.s, z12.s\n" |
| "fadd z22.s, z22.s, z25.s\n" |
| "fmin z16.s, p3/M, z16.s, z12.s\n" |
| "fmin z21.s, p3/M, z21.s, z12.s\n" |
| "fmin z20.s, p3/M, z20.s, z12.s\n" |
| "fmin z19.s, p3/M, z19.s, z12.s\n" |
| "fmin z24.s, p3/M, z24.s, z12.s\n" |
| "fmin z23.s, p3/M, z23.s, z12.s\n" |
| "fmin z22.s, p3/M, z22.s, z12.s\n" |
| "fmax z18.s, p3/M, z18.s, z11.s\n" |
| "fmax z17.s, p3/M, z17.s, z11.s\n" |
| "fmax z16.s, p3/M, z16.s, z11.s\n" |
| "fmax z21.s, p3/M, z21.s, z11.s\n" |
| "fmax z20.s, p3/M, z20.s, z11.s\n" |
| "fmax z19.s, p3/M, z19.s, z11.s\n" |
| "fmax z24.s, p3/M, z24.s, z11.s\n" |
| ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n" |
| "fmax z23.s, p3/M, z23.s, z11.s\n" |
| "fmax z22.s, p3/M, z22.s, z11.s\n" |
| ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n" |
| ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n" |
| ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n" |
| ".inst 0x658aae94 // bfcvt z20.h, p3/M, z20.s\n" |
| "st1h { z18.s }, p2, [x10]\n" |
| ".inst 0x658aae73 // bfcvt z19.h, p3/M, z19.s\n" |
| ".inst 0x658aaf12 // bfcvt z18.h, p3/M, z24.s\n" |
| "st1h { z17.s }, p1, [x10, #1, MUL VL]\n" |
| "st1h { z16.s }, p0, [x10, #2, MUL VL]\n" |
| ".inst 0x658aaef1 // bfcvt z17.h, p3/M, z23.s\n" |
| ".inst 0x658aaed0 // bfcvt z16.h, p3/M, z22.s\n" |
| "inch x10, ALL, MUL #3\n" |
| "st1h { z21.s }, p2, [x28]\n" |
| "st1h { z20.s }, p1, [x28, #1, MUL VL]\n" |
| "st1h { z19.s }, p0, [x28, #2, MUL VL]\n" |
| "inch x28, ALL, MUL #3\n" |
| "st1h { z18.s }, p2, [x27]\n" |
| "st1h { z17.s }, p1, [x27, #1, MUL VL]\n" |
| "st1h { z16.s }, p0, [x27, #2, MUL VL]\n" |
| "inch x27, ALL, MUL #3\n" |
| "bgt 11b\n" |
| "b 52f\n" |
| "14:" // Initial: Height 4 |
| "mov x10, %x[out_ptr]\n" |
| "mov x11, %x[cols]\n" |
| "mov x9, %x[bias]\n" |
| "add x28, x10, %x[ldout], LSL #1\n" |
| "add x27, x28, %x[ldout], LSL #1\n" |
| "add x26, x27, %x[ldout], LSL #1\n" |
| "15:" // Initial: Height 4: Block loop |
| "mov x21, #0x0\n" |
| "addvl x20, %x[in_ptr], #16\n" |
| "whilelt p2.s, x21, x11\n" |
| "incw x21\n" |
| "whilelt p1.s, x21, x11\n" |
| "incw x21\n" |
| "whilelt p0.s, x21, x11\n" |
| "incw x21\n" |
| "cbnz %x[bias], 16f\n" |
| "mov z30.b, #0x0\n" |
| "mov z29.b, #0x0\n" |
| "mov z28.b, #0x0\n" |
| "b 17f\n" |
| "16:" // Initial: Height 4: Width 3: bias |
| "ld1h { z18.s }, p2/Z, [x9]\n" |
| "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n" |
| "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n" |
| "lsl z30.s, z18.s, #0x10\n" |
| "lsl z29.s, z17.s, #0x10\n" |
| "lsl z28.s, z16.s, #0x10\n" |
| "17:" // Initial: Height 4: Width 3: init done |
| "ld1w { z18.s }, p2/Z, [%x[in_ptr]]\n" |
| "ld1w { z17.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" |
| "decw x11, ALL, MUL #3\n" |
| "inch x9, ALL, MUL #3\n" |
| "ld1w { z16.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" |
| "ld1w { z24.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n" |
| "ld1w { z23.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n" |
| "ld1w { z22.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n" |
| "ld1w { z21.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n" |
| "ld1w { z20.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n" |
| "fadd z18.s, z18.s, z30.s\n" |
| "fadd z17.s, z17.s, z29.s\n" |
| "ld1w { z19.s }, p0/Z, [x20, #-8, MUL VL]\n" |
| "ld1w { z27.s }, p2/Z, [x20, #-7, MUL VL]\n" |
| "fadd z16.s, z16.s, z28.s\n" |
| "fadd z24.s, z24.s, z30.s\n" |
| "ld1w { z26.s }, p1/Z, [x20, #-6, MUL VL]\n" |
| "ld1w { z25.s }, p0/Z, [x20, #-5, MUL VL]\n" |
| "fadd z23.s, z23.s, z29.s\n" |
| "fadd z22.s, z22.s, z28.s\n" |
| "fadd z21.s, z21.s, z30.s\n" |
| "fadd z20.s, z20.s, z29.s\n" |
| "fmin z18.s, p3/M, z18.s, z12.s\n" |
| "fmin z17.s, p3/M, z17.s, z12.s\n" |
| "fadd z19.s, z19.s, z28.s\n" |
| "fadd z27.s, z27.s, z30.s\n" |
| "fmin z16.s, p3/M, z16.s, z12.s\n" |
| "fmin z24.s, p3/M, z24.s, z12.s\n" |
| "fadd z26.s, z26.s, z29.s\n" |
| "fadd z25.s, z25.s, z28.s\n" |
| "fmin z23.s, p3/M, z23.s, z12.s\n" |
| "fmin z22.s, p3/M, z22.s, z12.s\n" |
| "fmin z21.s, p3/M, z21.s, z12.s\n" |
| "fmin z20.s, p3/M, z20.s, z12.s\n" |
| "fmin z19.s, p3/M, z19.s, z12.s\n" |
| "fmin z27.s, p3/M, z27.s, z12.s\n" |
| "fmin z26.s, p3/M, z26.s, z12.s\n" |
| "fmin z25.s, p3/M, z25.s, z12.s\n" |
| "fmax z18.s, p3/M, z18.s, z11.s\n" |
| "fmax z17.s, p3/M, z17.s, z11.s\n" |
| "fmax z16.s, p3/M, z16.s, z11.s\n" |
| "fmax z24.s, p3/M, z24.s, z11.s\n" |
| "fmax z23.s, p3/M, z23.s, z11.s\n" |
| "fmax z22.s, p3/M, z22.s, z11.s\n" |
| "fmax z21.s, p3/M, z21.s, z11.s\n" |
| "fmax z20.s, p3/M, z20.s, z11.s\n" |
| ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n" |
| ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n" |
| "fmax z19.s, p3/M, z19.s, z11.s\n" |
| "fmax z27.s, p3/M, z27.s, z11.s\n" |
| ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n" |
| ".inst 0x658aaf18 // bfcvt z24.h, p3/M, z24.s\n" |
| "fmax z26.s, p3/M, z26.s, z11.s\n" |
| "fmax z25.s, p3/M, z25.s, z11.s\n" |
| ".inst 0x658aaef7 // bfcvt z23.h, p3/M, z23.s\n" |
| ".inst 0x658aaed6 // bfcvt z22.h, p3/M, z22.s\n" |
| "cmp x11, XZR\n" |
| "st1h { z18.s }, p2, [x10]\n" |
| ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n" |
| ".inst 0x658aae94 // bfcvt z20.h, p3/M, z20.s\n" |
| "st1h { z17.s }, p1, [x10, #1, MUL VL]\n" |
| ".inst 0x658aae73 // bfcvt z19.h, p3/M, z19.s\n" |
| ".inst 0x658aaf72 // bfcvt z18.h, p3/M, z27.s\n" |
| "addvl %x[in_ptr], %x[in_ptr], #24\n" |
| "st1h { z16.s }, p0, [x10, #2, MUL VL]\n" |
| ".inst 0x658aaf51 // bfcvt z17.h, p3/M, z26.s\n" |
| ".inst 0x658aaf30 // bfcvt z16.h, p3/M, z25.s\n" |
| "inch x10, ALL, MUL #3\n" |
| "st1h { z24.s }, p2, [x28]\n" |
| "st1h { z23.s }, p1, [x28, #1, MUL VL]\n" |
| "st1h { z22.s }, p0, [x28, #2, MUL VL]\n" |
| "inch x28, ALL, MUL #3\n" |
| "st1h { z21.s }, p2, [x27]\n" |
| "st1h { z20.s }, p1, [x27, #1, MUL VL]\n" |
| "st1h { z19.s }, p0, [x27, #2, MUL VL]\n" |
| "inch x27, ALL, MUL #3\n" |
| "st1h { z18.s }, p2, [x26]\n" |
| "st1h { z17.s }, p1, [x26, #1, MUL VL]\n" |
| "st1h { z16.s }, p0, [x26, #2, MUL VL]\n" |
| "inch x26, ALL, MUL #3\n" |
| "bgt 15b\n" |
| "b 52f\n" |
| "18:" // Initial: Height 5 |
| "mov x10, %x[out_ptr]\n" |
| "mov x11, %x[cols]\n" |
| "mov x9, %x[bias]\n" |
| "add x28, x10, %x[ldout], LSL #1\n" |
| "add x27, x28, %x[ldout], LSL #1\n" |
| "add x26, x27, %x[ldout], LSL #1\n" |
| "add x25, x26, %x[ldout], LSL #1\n" |
| "19:" // Initial: Height 5: Block loop |
| "mov x21, #0x0\n" |
| "addvl x20, %x[in_ptr], #16\n" |
| "whilelt p2.s, x21, x11\n" |
| "incw x21\n" |
| "whilelt p1.s, x21, x11\n" |
| "incw x21\n" |
| "whilelt p0.s, x21, x11\n" |
| "incw x21\n" |
| "cbnz %x[bias], 20f\n" |
| "mov z1.b, #0x0\n" |
| "mov z0.b, #0x0\n" |
| "mov z31.b, #0x0\n" |
| "b 21f\n" |
| "20:" // Initial: Height 5: Width 3: bias |
| "ld1h { z18.s }, p2/Z, [x9]\n" |
| "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n" |
| "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n" |
| "lsl z1.s, z18.s, #0x10\n" |
| "lsl z0.s, z17.s, #0x10\n" |
| "lsl z31.s, z16.s, #0x10\n" |
| "21:" // Initial: Height 5: Width 3: init done |
| "ld1w { z21.s }, p2/Z, [%x[in_ptr]]\n" |
| "ld1w { z20.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" |
| "decw x11, ALL, MUL #3\n" |
| "inch x9, ALL, MUL #3\n" |
| "ld1w { z19.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" |
| "ld1w { z18.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n" |
| "ld1w { z17.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n" |
| "ld1w { z16.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n" |
| "ld1w { z24.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n" |
| "ld1w { z23.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n" |
| "fadd z21.s, z21.s, z1.s\n" |
| "fadd z20.s, z20.s, z0.s\n" |
| "ld1w { z22.s }, p0/Z, [x20, #-8, MUL VL]\n" |
| "ld1w { z30.s }, p2/Z, [x20, #-7, MUL VL]\n" |
| "fadd z19.s, z19.s, z31.s\n" |
| "fadd z18.s, z18.s, z1.s\n" |
| "ld1w { z29.s }, p1/Z, [x20, #-6, MUL VL]\n" |
| "ld1w { z28.s }, p0/Z, [x20, #-5, MUL VL]\n" |
| "fadd z17.s, z17.s, z0.s\n" |
| "fadd z16.s, z16.s, z31.s\n" |
| "ld1w { z27.s }, p2/Z, [x20, #-4, MUL VL]\n" |
| "ld1w { z26.s }, p1/Z, [x20, #-3, MUL VL]\n" |
| "fadd z24.s, z24.s, z1.s\n" |
| "fadd z23.s, z23.s, z0.s\n" |
| "ld1w { z25.s }, p0/Z, [x20, #-2, MUL VL]\n" |
| "fadd z22.s, z22.s, z31.s\n" |
| "fadd z30.s, z30.s, z1.s\n" |
| "fmin z21.s, p3/M, z21.s, z12.s\n" |
| "fadd z29.s, z29.s, z0.s\n" |
| "fadd z28.s, z28.s, z31.s\n" |
| "fmin z20.s, p3/M, z20.s, z12.s\n" |
| "fmin z19.s, p3/M, z19.s, z12.s\n" |
| "fadd z27.s, z27.s, z1.s\n" |
| "fadd z26.s, z26.s, z0.s\n" |
| "fmin z18.s, p3/M, z18.s, z12.s\n" |
| "fmin z17.s, p3/M, z17.s, z12.s\n" |
| "fadd z25.s, z25.s, z31.s\n" |
| "fmin z16.s, p3/M, z16.s, z12.s\n" |
| "fmin z24.s, p3/M, z24.s, z12.s\n" |
| "fmin z23.s, p3/M, z23.s, z12.s\n" |
| "fmin z22.s, p3/M, z22.s, z12.s\n" |
| "fmin z30.s, p3/M, z30.s, z12.s\n" |
| "fmin z29.s, p3/M, z29.s, z12.s\n" |
| "fmin z28.s, p3/M, z28.s, z12.s\n" |
| "fmin z27.s, p3/M, z27.s, z12.s\n" |
| "fmin z26.s, p3/M, z26.s, z12.s\n" |
| "fmin z25.s, p3/M, z25.s, z12.s\n" |
| "fmax z21.s, p3/M, z21.s, z11.s\n" |
| "fmax z20.s, p3/M, z20.s, z11.s\n" |
| "fmax z19.s, p3/M, z19.s, z11.s\n" |
| "fmax z18.s, p3/M, z18.s, z11.s\n" |
| "fmax z17.s, p3/M, z17.s, z11.s\n" |
| "fmax z16.s, p3/M, z16.s, z11.s\n" |
| "fmax z24.s, p3/M, z24.s, z11.s\n" |
| "fmax z23.s, p3/M, z23.s, z11.s\n" |
| ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n" |
| ".inst 0x658aae94 // bfcvt z20.h, p3/M, z20.s\n" |
| "fmax z22.s, p3/M, z22.s, z11.s\n" |
| "fmax z30.s, p3/M, z30.s, z11.s\n" |
| ".inst 0x658aae73 // bfcvt z19.h, p3/M, z19.s\n" |
| ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n" |
| "fmax z29.s, p3/M, z29.s, z11.s\n" |
| "fmax z28.s, p3/M, z28.s, z11.s\n" |
| ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n" |
| ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n" |
| "fmax z27.s, p3/M, z27.s, z11.s\n" |
| "fmax z26.s, p3/M, z26.s, z11.s\n" |
| "st1h { z21.s }, p2, [x10]\n" |
| ".inst 0x658aaf18 // bfcvt z24.h, p3/M, z24.s\n" |
| "fmax z25.s, p3/M, z25.s, z11.s\n" |
| "cmp x11, XZR\n" |
| "st1h { z20.s }, p1, [x10, #1, MUL VL]\n" |
| ".inst 0x658aaef7 // bfcvt z23.h, p3/M, z23.s\n" |
| "st1h { z19.s }, p0, [x10, #2, MUL VL]\n" |
| ".inst 0x658aaed6 // bfcvt z22.h, p3/M, z22.s\n" |
| ".inst 0x658aafd5 // bfcvt z21.h, p3/M, z30.s\n" |
| "inch x10, ALL, MUL #3\n" |
| "st1h { z18.s }, p2, [x28]\n" |
| ".inst 0x658aafb4 // bfcvt z20.h, p3/M, z29.s\n" |
| ".inst 0x658aaf93 // bfcvt z19.h, p3/M, z28.s\n" |
| "addvl %x[in_ptr], %x[in_ptr], #24\n" |
| "st1h { z17.s }, p1, [x28, #1, MUL VL]\n" |
| ".inst 0x658aaf72 // bfcvt z18.h, p3/M, z27.s\n" |
| ".inst 0x658aaf51 // bfcvt z17.h, p3/M, z26.s\n" |
| "st1h { z16.s }, p0, [x28, #2, MUL VL]\n" |
| ".inst 0x658aaf30 // bfcvt z16.h, p3/M, z25.s\n" |
| "inch x28, ALL, MUL #3\n" |
| "st1h { z24.s }, p2, [x27]\n" |
| "st1h { z23.s }, p1, [x27, #1, MUL VL]\n" |
| "st1h { z22.s }, p0, [x27, #2, MUL VL]\n" |
| "inch x27, ALL, MUL #3\n" |
| "st1h { z21.s }, p2, [x26]\n" |
| "st1h { z20.s }, p1, [x26, #1, MUL VL]\n" |
| "st1h { z19.s }, p0, [x26, #2, MUL VL]\n" |
| "inch x26, ALL, MUL #3\n" |
| "st1h { z18.s }, p2, [x25]\n" |
| "st1h { z17.s }, p1, [x25, #1, MUL VL]\n" |
| "st1h { z16.s }, p0, [x25, #2, MUL VL]\n" |
| "inch x25, ALL, MUL #3\n" |
| "bgt 19b\n" |
| "b 52f\n" |
| "22:" // Initial: Height 6 |
| "mov x10, %x[out_ptr]\n" |
| "mov x11, %x[cols]\n" |
| "mov x9, %x[bias]\n" |
| "add x28, x10, %x[ldout], LSL #1\n" |
| "add x27, x28, %x[ldout], LSL #1\n" |
| "add x26, x27, %x[ldout], LSL #1\n" |
| "add x25, x26, %x[ldout], LSL #1\n" |
| "add x24, x25, %x[ldout], LSL #1\n" |
| "23:" // Initial: Height 6: Block loop |
| "mov x21, #0x0\n" |
| "addvl x20, %x[in_ptr], #16\n" |
| "whilelt p2.s, x21, x11\n" |
| "incw x21\n" |
| "whilelt p1.s, x21, x11\n" |
| "incw x21\n" |
| "whilelt p0.s, x21, x11\n" |
| "incw x21\n" |
| "cbnz %x[bias], 24f\n" |
| "mov z4.b, #0x0\n" |
| "mov z3.b, #0x0\n" |
| "mov z2.b, #0x0\n" |
| "b 25f\n" |
| "24:" // Initial: Height 6: Width 3: bias |
| "ld1h { z18.s }, p2/Z, [x9]\n" |
| "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n" |
| "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n" |
| "lsl z4.s, z18.s, #0x10\n" |
| "lsl z3.s, z17.s, #0x10\n" |
| "lsl z2.s, z16.s, #0x10\n" |
| "25:" // Initial: Height 6: Width 3: init done |
| "ld1w { z17.s }, p2/Z, [%x[in_ptr]]\n" |
| "ld1w { z16.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" |
| "decw x11, ALL, MUL #3\n" |
| "inch x9, ALL, MUL #3\n" |
| "ld1w { z21.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" |
| "ld1w { z20.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n" |
| "ld1w { z19.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n" |
| "ld1w { z18.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n" |
| "ld1w { z1.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n" |
| "ld1w { z0.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n" |
| "fadd z17.s, z17.s, z4.s\n" |
| "fadd z16.s, z16.s, z3.s\n" |
| "ld1w { z25.s }, p0/Z, [x20, #-8, MUL VL]\n" |
| "ld1w { z24.s }, p2/Z, [x20, #-7, MUL VL]\n" |
| "fadd z21.s, z21.s, z2.s\n" |
| "fadd z20.s, z20.s, z4.s\n" |
| "ld1w { z23.s }, p1/Z, [x20, #-6, MUL VL]\n" |
| "ld1w { z22.s }, p0/Z, [x20, #-5, MUL VL]\n" |
| "fadd z19.s, z19.s, z3.s\n" |
| "fadd z18.s, z18.s, z2.s\n" |
| "ld1w { z31.s }, p2/Z, [x20, #-4, MUL VL]\n" |
| "ld1w { z30.s }, p1/Z, [x20, #-3, MUL VL]\n" |
| "fadd z1.s, z1.s, z4.s\n" |
| "fadd z0.s, z0.s, z3.s\n" |
| "ld1w { z29.s }, p0/Z, [x20, #-2, MUL VL]\n" |
| "ld1w { z28.s }, p2/Z, [x20, #-1, MUL VL]\n" |
| "fadd z25.s, z25.s, z2.s\n" |
| "fadd z24.s, z24.s, z4.s\n" |
| "ld1w { z27.s }, p1/Z, [x20]\n" |
| "ld1w { z26.s }, p0/Z, [x20, #1, MUL VL]\n" |
| "fadd z23.s, z23.s, z3.s\n" |
| "fadd z22.s, z22.s, z2.s\n" |
| "fadd z31.s, z31.s, z4.s\n" |
| "fadd z30.s, z30.s, z3.s\n" |
| "fmin z17.s, p3/M, z17.s, z12.s\n" |
| "fmin z16.s, p3/M, z16.s, z12.s\n" |
| "fadd z29.s, z29.s, z2.s\n" |
| "fadd z28.s, z28.s, z4.s\n" |
| "fmin z21.s, p3/M, z21.s, z12.s\n" |
| "fmin z20.s, p3/M, z20.s, z12.s\n" |
| "fadd z27.s, z27.s, z3.s\n" |
| "fadd z26.s, z26.s, z2.s\n" |
| "fmin z19.s, p3/M, z19.s, z12.s\n" |
| "fmin z18.s, p3/M, z18.s, z12.s\n" |
| "fmin z1.s, p3/M, z1.s, z12.s\n" |
| "fmin z0.s, p3/M, z0.s, z12.s\n" |
| "fmin z25.s, p3/M, z25.s, z12.s\n" |
| "fmin z24.s, p3/M, z24.s, z12.s\n" |
| "fmin z23.s, p3/M, z23.s, z12.s\n" |
| "fmin z22.s, p3/M, z22.s, z12.s\n" |
| "fmin z31.s, p3/M, z31.s, z12.s\n" |
| "fmin z30.s, p3/M, z30.s, z12.s\n" |
| "fmin z29.s, p3/M, z29.s, z12.s\n" |
| "fmin z28.s, p3/M, z28.s, z12.s\n" |
| "fmin z27.s, p3/M, z27.s, z12.s\n" |
| "fmin z26.s, p3/M, z26.s, z12.s\n" |
| "fmax z17.s, p3/M, z17.s, z11.s\n" |
| "fmax z16.s, p3/M, z16.s, z11.s\n" |
| "fmax z21.s, p3/M, z21.s, z11.s\n" |
| "fmax z20.s, p3/M, z20.s, z11.s\n" |
| "fmax z19.s, p3/M, z19.s, z11.s\n" |
| "fmax z18.s, p3/M, z18.s, z11.s\n" |
| "fmax z1.s, p3/M, z1.s, z11.s\n" |
| "fmax z0.s, p3/M, z0.s, z11.s\n" |
| ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n" |
| ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n" |
| "fmax z25.s, p3/M, z25.s, z11.s\n" |
| "fmax z24.s, p3/M, z24.s, z11.s\n" |
| ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n" |
| ".inst 0x658aae94 // bfcvt z20.h, p3/M, z20.s\n" |
| "fmax z23.s, p3/M, z23.s, z11.s\n" |
| "fmax z22.s, p3/M, z22.s, z11.s\n" |
| ".inst 0x658aae73 // bfcvt z19.h, p3/M, z19.s\n" |
| ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n" |
| "fmax z31.s, p3/M, z31.s, z11.s\n" |
| "fmax z30.s, p3/M, z30.s, z11.s\n" |
| "st1h { z17.s }, p2, [x10]\n" |
| ".inst 0x658aac31 // bfcvt z17.h, p3/M, z1.s\n" |
| "fmax z29.s, p3/M, z29.s, z11.s\n" |
| "fmax z28.s, p3/M, z28.s, z11.s\n" |
| "st1h { z16.s }, p1, [x10, #1, MUL VL]\n" |
| ".inst 0x658aac10 // bfcvt z16.h, p3/M, z0.s\n" |
| "fmax z27.s, p3/M, z27.s, z11.s\n" |
| "fmax z26.s, p3/M, z26.s, z11.s\n" |
| "st1h { z21.s }, p0, [x10, #2, MUL VL]\n" |
| ".inst 0x658aaf39 // bfcvt z25.h, p3/M, z25.s\n" |
| "cmp x11, XZR\n" |
| "st1h { z20.s }, p2, [x28]\n" |
| ".inst 0x658aaf18 // bfcvt z24.h, p3/M, z24.s\n" |
| ".inst 0x658aaef7 // bfcvt z23.h, p3/M, z23.s\n" |
| "st1h { z19.s }, p1, [x28, #1, MUL VL]\n" |
| ".inst 0x658aaed6 // bfcvt z22.h, p3/M, z22.s\n" |
| ".inst 0x658aaff5 // bfcvt z21.h, p3/M, z31.s\n" |
| "inch x10, ALL, MUL #3\n" |
| "st1h { z18.s }, p0, [x28, #2, MUL VL]\n" |
| ".inst 0x658aafd4 // bfcvt z20.h, p3/M, z30.s\n" |
| ".inst 0x658aafb3 // bfcvt z19.h, p3/M, z29.s\n" |
| "inch x28, ALL, MUL #3\n" |
| "st1h { z17.s }, p2, [x27]\n" |
| ".inst 0x658aaf92 // bfcvt z18.h, p3/M, z28.s\n" |
| ".inst 0x658aaf71 // bfcvt z17.h, p3/M, z27.s\n" |
| "addvl %x[in_ptr], %x[in_ptr], #24\n" |
| "st1h { z16.s }, p1, [x27, #1, MUL VL]\n" |
| ".inst 0x658aaf50 // bfcvt z16.h, p3/M, z26.s\n" |
| "st1h { z25.s }, p0, [x27, #2, MUL VL]\n" |
| "inch x27, ALL, MUL #3\n" |
| "st1h { z24.s }, p2, [x26]\n" |
| "st1h { z23.s }, p1, [x26, #1, MUL VL]\n" |
| "st1h { z22.s }, p0, [x26, #2, MUL VL]\n" |
| "inch x26, ALL, MUL #3\n" |
| "st1h { z21.s }, p2, [x25]\n" |
| "st1h { z20.s }, p1, [x25, #1, MUL VL]\n" |
| "st1h { z19.s }, p0, [x25, #2, MUL VL]\n" |
| "inch x25, ALL, MUL #3\n" |
| "st1h { z18.s }, p2, [x24]\n" |
| "st1h { z17.s }, p1, [x24, #1, MUL VL]\n" |
| "st1h { z16.s }, p0, [x24, #2, MUL VL]\n" |
| "inch x24, ALL, MUL #3\n" |
| "bgt 23b\n" |
| "b 52f\n" |
| "26:" // Initial: Height 7 |
| "mov x10, %x[out_ptr]\n" |
| "mov x11, %x[cols]\n" |
| "mov x9, %x[bias]\n" |
| "add x28, x10, %x[ldout], LSL #1\n" |
| "add x27, x28, %x[ldout], LSL #1\n" |
| "add x26, x27, %x[ldout], LSL #1\n" |
| "add x25, x26, %x[ldout], LSL #1\n" |
| "add x24, x25, %x[ldout], LSL #1\n" |
| "add x23, x24, %x[ldout], LSL #1\n" |
| "27:" // Initial: Height 7: Block loop |
| "mov x21, #0x0\n" |
| "addvl x20, %x[in_ptr], #16\n" |
| "whilelt p2.s, x21, x11\n" |
| "incw x21\n" |
| "whilelt p1.s, x21, x11\n" |
| "incw x21\n" |
| "whilelt p0.s, x21, x11\n" |
| "incw x21\n" |
| "cbnz %x[bias], 28f\n" |
| "mov z7.b, #0x0\n" |
| "mov z6.b, #0x0\n" |
| "mov z5.b, #0x0\n" |
| "b 29f\n" |
| "28:" // Initial: Height 7: Width 3: bias |
| "ld1h { z18.s }, p2/Z, [x9]\n" |
| "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n" |
| "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n" |
| "lsl z7.s, z18.s, #0x10\n" |
| "lsl z6.s, z17.s, #0x10\n" |
| "lsl z5.s, z16.s, #0x10\n" |
| "29:" // Initial: Height 7: Width 3: init done |
| "ld1w { z19.s }, p2/Z, [%x[in_ptr]]\n" |
| "ld1w { z18.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" |
| "decw x11, ALL, MUL #3\n" |
| "inch x9, ALL, MUL #3\n" |
| "ld1w { z17.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" |
| "ld1w { z16.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n" |
| "ld1w { z21.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n" |
| "ld1w { z20.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n" |
| "ld1w { z4.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n" |
| "ld1w { z3.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n" |
| "fadd z19.s, z19.s, z7.s\n" |
| "fadd z18.s, z18.s, z6.s\n" |
| "ld1w { z2.s }, p0/Z, [x20, #-8, MUL VL]\n" |
| "ld1w { z1.s }, p2/Z, [x20, #-7, MUL VL]\n" |
| "fadd z17.s, z17.s, z5.s\n" |
| "fadd z16.s, z16.s, z7.s\n" |
| "ld1w { z26.s }, p1/Z, [x20, #-6, MUL VL]\n" |
| "ld1w { z25.s }, p0/Z, [x20, #-5, MUL VL]\n" |
| "fadd z21.s, z21.s, z6.s\n" |
| "fadd z20.s, z20.s, z5.s\n" |
| "ld1w { z24.s }, p2/Z, [x20, #-4, MUL VL]\n" |
| "ld1w { z23.s }, p1/Z, [x20, #-3, MUL VL]\n" |
| "fadd z4.s, z4.s, z7.s\n" |
| "fadd z3.s, z3.s, z6.s\n" |
| "ld1w { z22.s }, p0/Z, [x20, #-2, MUL VL]\n" |
| "ld1w { z0.s }, p2/Z, [x20, #-1, MUL VL]\n" |
| "fadd z2.s, z2.s, z5.s\n" |
| "fadd z1.s, z1.s, z7.s\n" |
| "ld1w { z31.s }, p1/Z, [x20]\n" |
| "ld1w { z30.s }, p0/Z, [x20, #1, MUL VL]\n" |
| "fadd z26.s, z26.s, z6.s\n" |
| "fadd z25.s, z25.s, z5.s\n" |
| "ld1w { z29.s }, p2/Z, [x20, #2, MUL VL]\n" |
| "ld1w { z28.s }, p1/Z, [x20, #3, MUL VL]\n" |
| "fadd z24.s, z24.s, z7.s\n" |
| "fadd z23.s, z23.s, z6.s\n" |
| "ld1w { z27.s }, p0/Z, [x20, #4, MUL VL]\n" |
| "fadd z22.s, z22.s, z5.s\n" |
| "fadd z0.s, z0.s, z7.s\n" |
| "fmin z19.s, p3/M, z19.s, z12.s\n" |
| "fadd z31.s, z31.s, z6.s\n" |
| "fadd z30.s, z30.s, z5.s\n" |
| "fmin z18.s, p3/M, z18.s, z12.s\n" |
| "fmin z17.s, p3/M, z17.s, z12.s\n" |
| "fadd z29.s, z29.s, z7.s\n" |
| "fadd z28.s, z28.s, z6.s\n" |
| "fmin z16.s, p3/M, z16.s, z12.s\n" |
| "fmin z21.s, p3/M, z21.s, z12.s\n" |
| "fadd z27.s, z27.s, z5.s\n" |
| "fmin z20.s, p3/M, z20.s, z12.s\n" |
| "fmin z4.s, p3/M, z4.s, z12.s\n" |
| "fmin z3.s, p3/M, z3.s, z12.s\n" |
| "fmin z2.s, p3/M, z2.s, z12.s\n" |
| "fmin z1.s, p3/M, z1.s, z12.s\n" |
| "fmin z26.s, p3/M, z26.s, z12.s\n" |
| "fmin z25.s, p3/M, z25.s, z12.s\n" |
| "fmin z24.s, p3/M, z24.s, z12.s\n" |
| "fmin z23.s, p3/M, z23.s, z12.s\n" |
| "fmin z22.s, p3/M, z22.s, z12.s\n" |
| "fmin z0.s, p3/M, z0.s, z12.s\n" |
| "fmin z31.s, p3/M, z31.s, z12.s\n" |
| "fmin z30.s, p3/M, z30.s, z12.s\n" |
| "fmin z29.s, p3/M, z29.s, z12.s\n" |
| "fmin z28.s, p3/M, z28.s, z12.s\n" |
| "fmin z27.s, p3/M, z27.s, z12.s\n" |
| "fmax z19.s, p3/M, z19.s, z11.s\n" |
| "fmax z18.s, p3/M, z18.s, z11.s\n" |
| "fmax z17.s, p3/M, z17.s, z11.s\n" |
| "fmax z16.s, p3/M, z16.s, z11.s\n" |
| "fmax z21.s, p3/M, z21.s, z11.s\n" |
| "fmax z20.s, p3/M, z20.s, z11.s\n" |
| "fmax z4.s, p3/M, z4.s, z11.s\n" |
| "fmax z3.s, p3/M, z3.s, z11.s\n" |
| ".inst 0x658aae73 // bfcvt z19.h, p3/M, z19.s\n" |
| ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n" |
| "fmax z2.s, p3/M, z2.s, z11.s\n" |
| "fmax z1.s, p3/M, z1.s, z11.s\n" |
| ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n" |
| ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n" |
| "fmax z26.s, p3/M, z26.s, z11.s\n" |
| "fmax z25.s, p3/M, z25.s, z11.s\n" |
| ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n" |
| ".inst 0x658aae94 // bfcvt z20.h, p3/M, z20.s\n" |
| "fmax z24.s, p3/M, z24.s, z11.s\n" |
| "fmax z23.s, p3/M, z23.s, z11.s\n" |
| "st1h { z19.s }, p2, [x10]\n" |
| ".inst 0x658aac93 // bfcvt z19.h, p3/M, z4.s\n" |
| "fmax z22.s, p3/M, z22.s, z11.s\n" |
| "fmax z0.s, p3/M, z0.s, z11.s\n" |
| "st1h { z18.s }, p1, [x10, #1, MUL VL]\n" |
| ".inst 0x658aac72 // bfcvt z18.h, p3/M, z3.s\n" |
| "fmax z31.s, p3/M, z31.s, z11.s\n" |
| "fmax z30.s, p3/M, z30.s, z11.s\n" |
| "st1h { z17.s }, p0, [x10, #2, MUL VL]\n" |
| ".inst 0x658aac51 // bfcvt z17.h, p3/M, z2.s\n" |
| "fmax z29.s, p3/M, z29.s, z11.s\n" |
| "fmax z28.s, p3/M, z28.s, z11.s\n" |
| "st1h { z16.s }, p2, [x28]\n" |
| ".inst 0x658aac30 // bfcvt z16.h, p3/M, z1.s\n" |
| "fmax z27.s, p3/M, z27.s, z11.s\n" |
| "cmp x11, XZR\n" |
| "st1h { z21.s }, p1, [x28, #1, MUL VL]\n" |
| ".inst 0x658aaf5a // bfcvt z26.h, p3/M, z26.s\n" |
| "st1h { z20.s }, p0, [x28, #2, MUL VL]\n" |
| ".inst 0x658aaf39 // bfcvt z25.h, p3/M, z25.s\n" |
| ".inst 0x658aaf18 // bfcvt z24.h, p3/M, z24.s\n" |
| "inch x10, ALL, MUL #3\n" |
| "st1h { z19.s }, p2, [x27]\n" |
| ".inst 0x658aaef7 // bfcvt z23.h, p3/M, z23.s\n" |
| ".inst 0x658aaed6 // bfcvt z22.h, p3/M, z22.s\n" |
| "inch x28, ALL, MUL #3\n" |
| "st1h { z18.s }, p1, [x27, #1, MUL VL]\n" |
| ".inst 0x658aac15 // bfcvt z21.h, p3/M, z0.s\n" |
| ".inst 0x658aaff4 // bfcvt z20.h, p3/M, z31.s\n" |
| "addvl %x[in_ptr], %x[in_ptr], #24\n" |
| "st1h { z17.s }, p0, [x27, #2, MUL VL]\n" |
| ".inst 0x658aafd3 // bfcvt z19.h, p3/M, z30.s\n" |
| ".inst 0x658aafb2 // bfcvt z18.h, p3/M, z29.s\n" |
| "inch x27, ALL, MUL #3\n" |
| "st1h { z16.s }, p2, [x26]\n" |
| ".inst 0x658aaf91 // bfcvt z17.h, p3/M, z28.s\n" |
| ".inst 0x658aaf70 // bfcvt z16.h, p3/M, z27.s\n" |
| "st1h { z26.s }, p1, [x26, #1, MUL VL]\n" |
| "st1h { z25.s }, p0, [x26, #2, MUL VL]\n" |
| "inch x26, ALL, MUL #3\n" |
| "st1h { z24.s }, p2, [x25]\n" |
| "st1h { z23.s }, p1, [x25, #1, MUL VL]\n" |
| "st1h { z22.s }, p0, [x25, #2, MUL VL]\n" |
| "inch x25, ALL, MUL #3\n" |
| "st1h { z21.s }, p2, [x24]\n" |
| "st1h { z20.s }, p1, [x24, #1, MUL VL]\n" |
| "st1h { z19.s }, p0, [x24, #2, MUL VL]\n" |
| "inch x24, ALL, MUL #3\n" |
| "st1h { z18.s }, p2, [x23]\n" |
| "st1h { z17.s }, p1, [x23, #1, MUL VL]\n" |
| "st1h { z16.s }, p0, [x23, #2, MUL VL]\n" |
| "inch x23, ALL, MUL #3\n" |
| "bgt 27b\n" |
| "b 52f\n" |
| "30:" // Initial: Height 8 |
| "mov x10, %x[out_ptr]\n" |
| "mov x11, %x[cols]\n" |
| "mov x9, %x[bias]\n" |
| "add x28, x10, %x[ldout], LSL #1\n" |
| "add x27, x28, %x[ldout], LSL #1\n" |
| "add x26, x27, %x[ldout], LSL #1\n" |
| "add x25, x26, %x[ldout], LSL #1\n" |
| "add x24, x25, %x[ldout], LSL #1\n" |
| "add x23, x24, %x[ldout], LSL #1\n" |
| "add x22, x23, %x[ldout], LSL #1\n" |
| "31:" // Initial: Height 8: Block loop |
| "mov x21, #0x0\n" |
| "addvl x20, %x[in_ptr], #16\n" |
| "whilelt p2.s, x21, x11\n" |
| "incw x21\n" |
| "whilelt p1.s, x21, x11\n" |
| "incw x21\n" |
| "whilelt p0.s, x21, x11\n" |
| "incw x21\n" |
| "cbnz %x[bias], 32f\n" |
| "mov z10.b, #0x0\n" |
| "mov z9.b, #0x0\n" |
| "mov z8.b, #0x0\n" |
| "b 33f\n" |
| "32:" // Initial: Height 8: Width 3: bias |
| "ld1h { z18.s }, p2/Z, [x9]\n" |
| "ld1h { z17.s }, p1/Z, [x9, #1, MUL VL]\n" |
| "ld1h { z16.s }, p0/Z, [x9, #2, MUL VL]\n" |
| "lsl z10.s, z18.s, #0x10\n" |
| "lsl z9.s, z17.s, #0x10\n" |
| "lsl z8.s, z16.s, #0x10\n" |
| "33:" // Initial: Height 8: Width 3: init done |
| "ld1w { z21.s }, p2/Z, [%x[in_ptr]]\n" |
| "ld1w { z20.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" |
| "decw x11, ALL, MUL #3\n" |
| "inch x9, ALL, MUL #3\n" |
| "ld1w { z19.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" |
| "ld1w { z18.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n" |
| "ld1w { z17.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n" |
| "ld1w { z16.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n" |
| "ld1w { z7.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n" |
| "ld1w { z6.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n" |
| "fadd z21.s, z21.s, z10.s\n" |
| "fadd z20.s, z20.s, z9.s\n" |
| "ld1w { z5.s }, p0/Z, [x20, #-8, MUL VL]\n" |
| "ld1w { z4.s }, p2/Z, [x20, #-7, MUL VL]\n" |
| "fadd z19.s, z19.s, z8.s\n" |
| "fadd z18.s, z18.s, z10.s\n" |
| "ld1w { z3.s }, p1/Z, [x20, #-6, MUL VL]\n" |
| "ld1w { z2.s }, p0/Z, [x20, #-5, MUL VL]\n" |
| "fadd z17.s, z17.s, z9.s\n" |
| "fadd z16.s, z16.s, z8.s\n" |
| "ld1w { z27.s }, p2/Z, [x20, #-4, MUL VL]\n" |
| "ld1w { z26.s }, p1/Z, [x20, #-3, MUL VL]\n" |
| "fadd z7.s, z7.s, z10.s\n" |
| "fadd z6.s, z6.s, z9.s\n" |
| "ld1w { z25.s }, p0/Z, [x20, #-2, MUL VL]\n" |
| "ld1w { z24.s }, p2/Z, [x20, #-1, MUL VL]\n" |
| "fadd z5.s, z5.s, z8.s\n" |
| "fadd z4.s, z4.s, z10.s\n" |
| "ld1w { z23.s }, p1/Z, [x20]\n" |
| "ld1w { z22.s }, p0/Z, [x20, #1, MUL VL]\n" |
| "fadd z3.s, z3.s, z9.s\n" |
| "fadd z2.s, z2.s, z8.s\n" |
| "ld1w { z1.s }, p2/Z, [x20, #2, MUL VL]\n" |
| "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n" |
| "fadd z27.s, z27.s, z10.s\n" |
| "fadd z26.s, z26.s, z9.s\n" |
| "ld1w { z31.s }, p0/Z, [x20, #4, MUL VL]\n" |
| "ld1w { z30.s }, p2/Z, [x20, #5, MUL VL]\n" |
| "fadd z25.s, z25.s, z8.s\n" |
| "fadd z24.s, z24.s, z10.s\n" |
| "ld1w { z29.s }, p1/Z, [x20, #6, MUL VL]\n" |
| "ld1w { z28.s }, p0/Z, [x20, #7, MUL VL]\n" |
| "fadd z23.s, z23.s, z9.s\n" |
| "fadd z22.s, z22.s, z8.s\n" |
| "fadd z1.s, z1.s, z10.s\n" |
| "fadd z0.s, z0.s, z9.s\n" |
| "fmin z21.s, p3/M, z21.s, z12.s\n" |
| "fmin z20.s, p3/M, z20.s, z12.s\n" |
| "fadd z31.s, z31.s, z8.s\n" |
| "fadd z30.s, z30.s, z10.s\n" |
| "fmin z19.s, p3/M, z19.s, z12.s\n" |
| "fmin z18.s, p3/M, z18.s, z12.s\n" |
| "fadd z29.s, z29.s, z9.s\n" |
| "fadd z28.s, z28.s, z8.s\n" |
| "fmin z17.s, p3/M, z17.s, z12.s\n" |
| "fmin z16.s, p3/M, z16.s, z12.s\n" |
| "fmin z7.s, p3/M, z7.s, z12.s\n" |
| "fmin z6.s, p3/M, z6.s, z12.s\n" |
| "fmin z5.s, p3/M, z5.s, z12.s\n" |
| "fmin z4.s, p3/M, z4.s, z12.s\n" |
| "fmin z3.s, p3/M, z3.s, z12.s\n" |
| "fmin z2.s, p3/M, z2.s, z12.s\n" |
| "fmin z27.s, p3/M, z27.s, z12.s\n" |
| "fmin z26.s, p3/M, z26.s, z12.s\n" |
| "fmin z25.s, p3/M, z25.s, z12.s\n" |
| "fmin z24.s, p3/M, z24.s, z12.s\n" |
| "fmin z23.s, p3/M, z23.s, z12.s\n" |
| "fmin z22.s, p3/M, z22.s, z12.s\n" |
| "fmin z1.s, p3/M, z1.s, z12.s\n" |
| "fmin z0.s, p3/M, z0.s, z12.s\n" |
| "fmin z31.s, p3/M, z31.s, z12.s\n" |
| "fmin z30.s, p3/M, z30.s, z12.s\n" |
| "fmin z29.s, p3/M, z29.s, z12.s\n" |
| "fmin z28.s, p3/M, z28.s, z12.s\n" |
| "fmax z21.s, p3/M, z21.s, z11.s\n" |
| "fmax z20.s, p3/M, z20.s, z11.s\n" |
| "fmax z19.s, p3/M, z19.s, z11.s\n" |
| "fmax z18.s, p3/M, z18.s, z11.s\n" |
| "fmax z17.s, p3/M, z17.s, z11.s\n" |
| "fmax z16.s, p3/M, z16.s, z11.s\n" |
| "fmax z7.s, p3/M, z7.s, z11.s\n" |
| "fmax z6.s, p3/M, z6.s, z11.s\n" |
| ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n" |
| ".inst 0x658aae94 // bfcvt z20.h, p3/M, z20.s\n" |
| "fmax z5.s, p3/M, z5.s, z11.s\n" |
| "fmax z4.s, p3/M, z4.s, z11.s\n" |
| ".inst 0x658aae73 // bfcvt z19.h, p3/M, z19.s\n" |
| ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n" |
| "fmax z3.s, p3/M, z3.s, z11.s\n" |
| "fmax z2.s, p3/M, z2.s, z11.s\n" |
| ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n" |
| ".inst 0x658aae10 // bfcvt z16.h, p3/M, z16.s\n" |
| "fmax z27.s, p3/M, z27.s, z11.s\n" |
| "fmax z26.s, p3/M, z26.s, z11.s\n" |
| "st1h { z21.s }, p2, [x10]\n" |
| ".inst 0x658aacf5 // bfcvt z21.h, p3/M, z7.s\n" |
| "fmax z25.s, p3/M, z25.s, z11.s\n" |
| "fmax z24.s, p3/M, z24.s, z11.s\n" |
| "st1h { z20.s }, p1, [x10, #1, MUL VL]\n" |
| ".inst 0x658aacd4 // bfcvt z20.h, p3/M, z6.s\n" |
| "fmax z23.s, p3/M, z23.s, z11.s\n" |
| "fmax z22.s, p3/M, z22.s, z11.s\n" |
| "st1h { z19.s }, p0, [x10, #2, MUL VL]\n" |
| ".inst 0x658aacb3 // bfcvt z19.h, p3/M, z5.s\n" |
| "fmax z1.s, p3/M, z1.s, z11.s\n" |
| "fmax z0.s, p3/M, z0.s, z11.s\n" |
| "st1h { z18.s }, p2, [x28]\n" |
| ".inst 0x658aac92 // bfcvt z18.h, p3/M, z4.s\n" |
| "fmax z31.s, p3/M, z31.s, z11.s\n" |
| "fmax z30.s, p3/M, z30.s, z11.s\n" |
| "st1h { z17.s }, p1, [x28, #1, MUL VL]\n" |
| ".inst 0x658aac71 // bfcvt z17.h, p3/M, z3.s\n" |
| "fmax z29.s, p3/M, z29.s, z11.s\n" |
| "fmax z28.s, p3/M, z28.s, z11.s\n" |
| "st1h { z16.s }, p0, [x28, #2, MUL VL]\n" |
| ".inst 0x658aac50 // bfcvt z16.h, p3/M, z2.s\n" |
| "cmp x11, XZR\n" |
| "st1h { z21.s }, p2, [x27]\n" |
| ".inst 0x658aaf7b // bfcvt z27.h, p3/M, z27.s\n" |
| ".inst 0x658aaf5a // bfcvt z26.h, p3/M, z26.s\n" |
| "st1h { z20.s }, p1, [x27, #1, MUL VL]\n" |
| ".inst 0x658aaf39 // bfcvt z25.h, p3/M, z25.s\n" |
| ".inst 0x658aaf18 // bfcvt z24.h, p3/M, z24.s\n" |
| "inch x10, ALL, MUL #3\n" |
| "st1h { z19.s }, p0, [x27, #2, MUL VL]\n" |
| ".inst 0x658aaef7 // bfcvt z23.h, p3/M, z23.s\n" |
| ".inst 0x658aaed6 // bfcvt z22.h, p3/M, z22.s\n" |
| "inch x28, ALL, MUL #3\n" |
| "st1h { z18.s }, p2, [x26]\n" |
| ".inst 0x658aac35 // bfcvt z21.h, p3/M, z1.s\n" |
| ".inst 0x658aac14 // bfcvt z20.h, p3/M, z0.s\n" |
| "inch x27, ALL, MUL #3\n" |
| "st1h { z17.s }, p1, [x26, #1, MUL VL]\n" |
| ".inst 0x658aaff3 // bfcvt z19.h, p3/M, z31.s\n" |
| ".inst 0x658aafd2 // bfcvt z18.h, p3/M, z30.s\n" |
| "addvl %x[in_ptr], %x[in_ptr], #24\n" |
| "st1h { z16.s }, p0, [x26, #2, MUL VL]\n" |
| ".inst 0x658aafb1 // bfcvt z17.h, p3/M, z29.s\n" |
| ".inst 0x658aaf90 // bfcvt z16.h, p3/M, z28.s\n" |
| "inch x26, ALL, MUL #3\n" |
| "st1h { z27.s }, p2, [x25]\n" |
| "st1h { z26.s }, p1, [x25, #1, MUL VL]\n" |
| "st1h { z25.s }, p0, [x25, #2, MUL VL]\n" |
| "inch x25, ALL, MUL #3\n" |
| "st1h { z24.s }, p2, [x24]\n" |
| "st1h { z23.s }, p1, [x24, #1, MUL VL]\n" |
| "st1h { z22.s }, p0, [x24, #2, MUL VL]\n" |
| "inch x24, ALL, MUL #3\n" |
| "st1h { z21.s }, p2, [x23]\n" |
| "st1h { z20.s }, p1, [x23, #1, MUL VL]\n" |
| "st1h { z19.s }, p0, [x23, #2, MUL VL]\n" |
| "inch x23, ALL, MUL #3\n" |
| "st1h { z18.s }, p2, [x22]\n" |
| "st1h { z17.s }, p1, [x22, #1, MUL VL]\n" |
| "st1h { z16.s }, p0, [x22, #2, MUL VL]\n" |
| "inch x22, ALL, MUL #3\n" |
| "bgt 31b\n" |
| "subs %x[rows], %x[rows], #0x8\n" |
| "add %x[out_ptr], %x[out_ptr], x12\n" |
| "bgt 1b\n" |
| "b 52f\n" |
| "34:" // Accumulate |
| "35:" // Accumulate: Row loop |
| "cmp %x[rows], #0x7\n" |
| "bgt 50f\n" |
| "beq 48f\n" |
| "cmp %x[rows], #0x5\n" |
| "bgt 46f\n" |
| "beq 44f\n" |
| "cmp %x[rows], #0x3\n" |
| "bgt 42f\n" |
| "beq 40f\n" |
| "cmp %x[rows], #0x1\n" |
| "bgt 38f\n" |
| "36:" // Accumulate: Height 1 |
| "mov x11, %x[cols]\n" |
| "mov x10, %x[out_ptr]\n" |
| "37:" // Accumulate: Height 1: Block loop |
| "mov x21, #0x0\n" |
| "addvl x20, %x[in_ptr], #16\n" |
| "whilelt p2.s, x21, x11\n" |
| "incw x21\n" |
| "ld1h { z16.s }, p2/Z, [x10]\n" |
| "ld1w { z19.s }, p2/Z, [%x[in_ptr]]\n" |
| "lsl z16.s, z16.s, #0x10\n" |
| "whilelt p1.s, x21, x11\n" |
| "incw x21\n" |
| "fadd z19.s, z19.s, z16.s\n" |
| "fmin z19.s, p3/M, z19.s, z12.s\n" |
| "ld1w { z18.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" |
| "whilelt p0.s, x21, x11\n" |
| "decw x11, ALL, MUL #3\n" |
| "incw x21\n" |
| "fmax z19.s, p3/M, z19.s, z11.s\n" |
| "ld1w { z17.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" |
| "addvl %x[in_ptr], %x[in_ptr], #24\n" |
| "cmp x11, XZR\n" |
| ".inst 0x658aae70 // bfcvt z16.h, p3/M, z19.s\n" |
| "st1h { z16.s }, p2, [x10]\n" |
| "ld1h { z16.s }, p1/Z, [x10, #1, MUL VL]\n" |
| "lsl z16.s, z16.s, #0x10\n" |
| "fadd z18.s, z18.s, z16.s\n" |
| "fmin z18.s, p3/M, z18.s, z12.s\n" |
| "fmax z18.s, p3/M, z18.s, z11.s\n" |
| ".inst 0x658aae50 // bfcvt z16.h, p3/M, z18.s\n" |
| "st1h { z16.s }, p1, [x10, #1, MUL VL]\n" |
| "ld1h { z16.s }, p0/Z, [x10, #2, MUL VL]\n" |
| "lsl z16.s, z16.s, #0x10\n" |
| "fadd z17.s, z17.s, z16.s\n" |
| "fmin z17.s, p3/M, z17.s, z12.s\n" |
| "fmax z17.s, p3/M, z17.s, z11.s\n" |
| ".inst 0x658aae30 // bfcvt z16.h, p3/M, z17.s\n" |
| "st1h { z16.s }, p0, [x10, #2, MUL VL]\n" |
| "inch x10, ALL, MUL #3\n" |
| "bgt 37b\n" |
| "b 52f\n" |
| "38:" // Accumulate: Height 2 |
| "mov x10, %x[out_ptr]\n" |
| "mov x11, %x[cols]\n" |
| "add x28, x10, %x[ldout], LSL #1\n" |
| "39:" // Accumulate: Height 2: Block loop |
| "mov x21, #0x0\n" |
| "addvl x20, %x[in_ptr], #16\n" |
| "whilelt p2.s, x21, x11\n" |
| "incw x21\n" |
| "ld1h { z17.s }, p2/Z, [x10]\n" |
| "ld1h { z16.s }, p2/Z, [x28]\n" |
| "ld1w { z23.s }, p2/Z, [%x[in_ptr]]\n" |
| "lsl z17.s, z17.s, #0x10\n" |
| "ld1w { z22.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n" |
| "lsl z16.s, z16.s, #0x10\n" |
| "whilelt p1.s, x21, x11\n" |
| "incw x21\n" |
| "fadd z23.s, z23.s, z17.s\n" |
| "fadd z22.s, z22.s, z16.s\n" |
| "fmin z23.s, p3/M, z23.s, z12.s\n" |
| "fmin z22.s, p3/M, z22.s, z12.s\n" |
| "ld1w { z21.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" |
| "ld1w { z20.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n" |
| "whilelt p0.s, x21, x11\n" |
| "decw x11, ALL, MUL #3\n" |
| "incw x21\n" |
| "fmax z23.s, p3/M, z23.s, z11.s\n" |
| "fmax z22.s, p3/M, z22.s, z11.s\n" |
| "ld1w { z19.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" |
| "ld1w { z18.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n" |
| "addvl %x[in_ptr], %x[in_ptr], #24\n" |
| "cmp x11, XZR\n" |
| ".inst 0x658aaef1 // bfcvt z17.h, p3/M, z23.s\n" |
| ".inst 0x658aaed0 // bfcvt z16.h, p3/M, z22.s\n" |
| "st1h { z17.s }, p2, [x10]\n" |
| "st1h { z16.s }, p2, [x28]\n" |
| "ld1h { z17.s }, p1/Z, [x10, #1, MUL VL]\n" |
| "ld1h { z16.s }, p1/Z, [x28, #1, MUL VL]\n" |
| "lsl z17.s, z17.s, #0x10\n" |
| "lsl z16.s, z16.s, #0x10\n" |
| "fadd z21.s, z21.s, z17.s\n" |
| "fadd z20.s, z20.s, z16.s\n" |
| "fmin z21.s, p3/M, z21.s, z12.s\n" |
| "fmin z20.s, p3/M, z20.s, z12.s\n" |
| "fmax z21.s, p3/M, z21.s, z11.s\n" |
| "fmax z20.s, p3/M, z20.s, z11.s\n" |
| ".inst 0x658aaeb0 // bfcvt z16.h, p3/M, z21.s\n" |
| "st1h { z16.s }, p1, [x10, #1, MUL VL]\n" |
| ".inst 0x658aae90 // bfcvt z16.h, p3/M, z20.s\n" |
| "ld1h { z17.s }, p0/Z, [x10, #2, MUL VL]\n" |
| "st1h { z16.s }, p1, [x28, #1, MUL VL]\n" |
| "ld1h { z16.s }, p0/Z, [x28, #2, MUL VL]\n" |
| "lsl z17.s, z17.s, #0x10\n" |
| "lsl z16.s, z16.s, #0x10\n" |
| "fadd z19.s, z19.s, z17.s\n" |
| "fadd z18.s, z18.s, z16.s\n" |
| "fmin z19.s, p3/M, z19.s, z12.s\n" |
| "fmin z18.s, p3/M, z18.s, z12.s\n" |
| "fmax z19.s, p3/M, z19.s, z11.s\n" |
| "fmax z18.s, p3/M, z18.s, z11.s\n" |
| ".inst 0x658aae70 // bfcvt z16.h, p3/M, z19.s\n" |
| "st1h { z16.s }, p0, [x10, #2, MUL VL]\n" |
| "inch x10, ALL, MUL #3\n" |
| ".inst 0x658aae50 // bfcvt z16.h, p3/M, z18.s\n" |
| "st1h { z16.s }, p0, [x28, #2, MUL VL]\n" |
| "inch x28, ALL, MUL #3\n" |
| "bgt 39b\n" |
| "b 52f\n" |
| "40:" // Accumulate: Height 3 |
| "mov x10, %x[out_ptr]\n" |
| "mov x11, %x[cols]\n" |
| "add x28, x10, %x[ldout], LSL #1\n" |
| "add x27, x28, %x[ldout], LSL #1\n" |
| "41:" // Accumulate: Height 3: Block loop |
| "mov x21, #0x0\n" |
| "addvl x20, %x[in_ptr], #16\n" |
| "whilelt p2.s, x21, x11\n" |
| "incw x21\n" |
| "ld1h { z18.s }, p2/Z, [x10]\n" |
| "ld1h { z17.s }, p2/Z, [x28]\n" |
| "ld1h { z16.s }, p2/Z, [x27]\n" |
| "ld1w { z26.s }, p2/Z, [%x[in_ptr]]\n" |
| "lsl z19.s, z18.s, #0x10\n" |
| "ld1w { z25.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n" |
| "lsl z17.s, z17.s, #0x10\n" |
| "ld1w { z18.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n" |
| "whilelt p1.s, x21, x11\n" |
| "incw x21\n" |
| "lsl z16.s, z16.s, #0x10\n" |
| "fadd z26.s, z26.s, z19.s\n" |
| "fadd z25.s, z25.s, z17.s\n" |
| "ld1w { z24.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" |
| "ld1w { z23.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n" |
| "ld1w { z22.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n" |
| "fadd z18.s, z18.s, z16.s\n" |
| "fmin z26.s, p3/M, z26.s, z12.s\n" |
| "whilelt p0.s, x21, x11\n" |
| "decw x11, ALL, MUL #3\n" |
| "incw x21\n" |
| "fmin z25.s, p3/M, z25.s, z12.s\n" |
| "fmin z18.s, p3/M, z18.s, z12.s\n" |
| "fmax z26.s, p3/M, z26.s, z11.s\n" |
| "ld1w { z21.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" |
| "ld1w { z20.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n" |
| "addvl %x[in_ptr], %x[in_ptr], #24\n" |
| "ld1w { z19.s }, p0/Z, [x20, #-8, MUL VL]\n" |
| "cmp x11, XZR\n" |
| "fmax z25.s, p3/M, z25.s, z11.s\n" |
| "fmax z18.s, p3/M, z18.s, z11.s\n" |
| ".inst 0x658aaf51 // bfcvt z17.h, p3/M, z26.s\n" |
| ".inst 0x658aaf30 // bfcvt z16.h, p3/M, z25.s\n" |
| "st1h { z17.s }, p2, [x10]\n" |
| "st1h { z16.s }, p2, [x28]\n" |
| ".inst 0x658aae51 // bfcvt z17.h, p3/M, z18.s\n" |
| "ld1h { z16.s }, p1/Z, [x10, #1, MUL VL]\n" |
| "st1h { z17.s }, p2, [x27]\n" |
| "ld1h { z17.s }, p1/Z, [x28, #1, MUL VL]\n" |
| "lsl z18.s, z16.s, #0x10\n" |
| "ld1h { z16.s }, p1/Z, [x27, #1, MUL VL]\n" |
| "lsl z17.s, z17.s, #0x10\n" |
| "lsl z16.s, z16.s, #0x10\n" |
| "fadd z24.s, z24.s, z18.s\n" |
| "fadd z23.s, z23.s, z17.s\n" |
| "fadd z22.s, z22.s, z16.s\n" |
| "fmin z24.s, p3/M, z24.s, z12.s\n" |
| "fmin z23.s, p3/M, z23.s, z12.s\n" |
| "fmin z22.s, p3/M, z22.s, z12.s\n" |
| "fmax z24.s, p3/M, z24.s, z11.s\n" |
| "fmax z23.s, p3/M, z23.s, z11.s\n" |
| "fmax z22.s, p3/M, z22.s, z11.s\n" |
| ".inst 0x658aaf10 // bfcvt z16.h, p3/M, z24.s\n" |
| "st1h { z16.s }, p1, [x10, #1, MUL VL]\n" |
| ".inst 0x658aaef2 // bfcvt z18.h, p3/M, z23.s\n" |
| ".inst 0x658aaed1 // bfcvt z17.h, p3/M, z22.s\n" |
| "ld1h { z16.s }, p0/Z, [x10, #2, MUL VL]\n" |
| "st1h { z18.s }, p1, [x28, #1, MUL VL]\n" |
| "st1h { z17.s }, p1, [x27, #1, MUL VL]\n" |
| "ld1h { z17.s }, p0/Z, [x28, #2, MUL VL]\n" |
| "lsl z18.s, z16.s, #0x10\n" |
| "ld1h { z16.s }, p0/Z, [x27, #2, MUL VL]\n" |
| "lsl z17.s, z17.s, #0x10\n" |
| "lsl z16.s, z16.s, #0x10\n" |
| "fadd z21.s, z21.s, z18.s\n" |
| "fadd z20.s, z20.s, z17.s\n" |
| "fadd z19.s, z19.s, z16.s\n" |
| "fmin z21.s, p3/M, z21.s, z12.s\n" |
| "fmin z20.s, p3/M, z20.s, z12.s\n" |
| "fmin z19.s, p3/M, z19.s, z12.s\n" |
| "fmax z21.s, p3/M, z21.s, z11.s\n" |
| "fmax z20.s, p3/M, z20.s, z11.s\n" |
| "fmax z19.s, p3/M, z19.s, z11.s\n" |
| ".inst 0x658aaeb0 // bfcvt z16.h, p3/M, z21.s\n" |
| "st1h { z16.s }, p0, [x10, #2, MUL VL]\n" |
| "inch x10, ALL, MUL #3\n" |
| ".inst 0x658aae91 // bfcvt z17.h, p3/M, z20.s\n" |
| ".inst 0x658aae70 // bfcvt z16.h, p3/M, z19.s\n" |
| "st1h { z17.s }, p0, [x28, #2, MUL VL]\n" |
| "inch x28, ALL, MUL #3\n" |
| "st1h { z16.s }, p0, [x27, #2, MUL VL]\n" |
| "inch x27, ALL, MUL #3\n" |
| "bgt 41b\n" |
| "b 52f\n" |
| "42:" // Accumulate: Height 4 |
| "mov x10, %x[out_ptr]\n" |
| "mov x11, %x[cols]\n" |
| "add x28, x10, %x[ldout], LSL #1\n" |
| "add x27, x28, %x[ldout], LSL #1\n" |
| "add x26, x27, %x[ldout], LSL #1\n" |
| "43:" // Accumulate: Height 4: Block loop |
| "mov x21, #0x0\n" |
| "addvl x20, %x[in_ptr], #16\n" |
| "whilelt p2.s, x21, x11\n" |
| "incw x21\n" |
| "ld1h { z19.s }, p2/Z, [x10]\n" |
| "ld1h { z18.s }, p2/Z, [x28]\n" |
| "ld1h { z17.s }, p2/Z, [x27]\n" |
| "ld1h { z16.s }, p2/Z, [x26]\n" |
| "ld1w { z30.s }, p2/Z, [%x[in_ptr]]\n" |
| "lsl z20.s, z19.s, #0x10\n" |
| "ld1w { z29.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n" |
| "lsl z18.s, z18.s, #0x10\n" |
| "ld1w { z28.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n" |
| "ld1w { z19.s }, p2/Z, [x20, #-7, MUL VL]\n" |
| "whilelt p1.s, x21, x11\n" |
| "incw x21\n" |
| "lsl z17.s, z17.s, #0x10\n" |
| "lsl z16.s, z16.s, #0x10\n" |
| "fadd z30.s, z30.s, z20.s\n" |
| "fadd z29.s, z29.s, z18.s\n" |
| "ld1w { z27.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" |
| "ld1w { z26.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n" |
| "ld1w { z25.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n" |
| "ld1w { z24.s }, p1/Z, [x20, #-6, MUL VL]\n" |
| "whilelt p0.s, x21, x11\n" |
| "decw x11, ALL, MUL #3\n" |
| "fadd z28.s, z28.s, z17.s\n" |
| "fadd z19.s, z19.s, z16.s\n" |
| "incw x21\n" |
| "fmin z30.s, p3/M, z30.s, z12.s\n" |
| "fmin z29.s, p3/M, z29.s, z12.s\n" |
| "ld1w { z23.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" |
| "ld1w { z22.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n" |
| "addvl %x[in_ptr], %x[in_ptr], #24\n" |
| "ld1w { z21.s }, p0/Z, [x20, #-8, MUL VL]\n" |
| "ld1w { z20.s }, p0/Z, [x20, #-5, MUL VL]\n" |
| "cmp x11, XZR\n" |
| "fmin z28.s, p3/M, z28.s, z12.s\n" |
| "fmin z19.s, p3/M, z19.s, z12.s\n" |
| "fmax z30.s, p3/M, z30.s, z11.s\n" |
| "fmax z29.s, p3/M, z29.s, z11.s\n" |
| "fmax z28.s, p3/M, z28.s, z11.s\n" |
| "fmax z19.s, p3/M, z19.s, z11.s\n" |
| ".inst 0x658aafd2 // bfcvt z18.h, p3/M, z30.s\n" |
| ".inst 0x658aafb1 // bfcvt z17.h, p3/M, z29.s\n" |
| ".inst 0x658aaf90 // bfcvt z16.h, p3/M, z28.s\n" |
| "st1h { z18.s }, p2, [x10]\n" |
| "st1h { z17.s }, p2, [x28]\n" |
| ".inst 0x658aae71 // bfcvt z17.h, p3/M, z19.s\n" |
| "st1h { z16.s }, p2, [x27]\n" |
| "ld1h { z16.s }, p1/Z, [x10, #1, MUL VL]\n" |
| "st1h { z17.s }, p2, [x26]\n" |
| "ld1h { z18.s }, p1/Z, [x28, #1, MUL VL]\n" |
| "ld1h { z17.s }, p1/Z, [x27, #1, MUL VL]\n" |
| "lsl z19.s, z16.s, #0x10\n" |
| "ld1h { z16.s }, p1/Z, [x26, #1, MUL VL]\n" |
| "lsl z18.s, z18.s, #0x10\n" |
| "lsl z17.s, z17.s, #0x10\n" |
| "fadd z27.s, z27.s, z19.s\n" |
| "lsl z16.s, z16.s, #0x10\n" |
| "fadd z26.s, z26.s, z18.s\n" |
| "fadd z25.s, z25.s, z17.s\n" |
| "fadd z24.s, z24.s, z16.s\n" |
| "fmin z27.s, p3/M, z27.s, z12.s\n" |
| "fmin z26.s, p3/M, z26.s, z12.s\n" |
| "fmin z25.s, p3/M, z25.s, z12.s\n" |
| "fmin z24.s, p3/M, z24.s, z12.s\n" |
| "fmax z27.s, p3/M, z27.s, z11.s\n" |
| "fmax z26.s, p3/M, z26.s, z11.s\n" |
| "fmax z25.s, p3/M, z25.s, z11.s\n" |
| "fmax z24.s, p3/M, z24.s, z11.s\n" |
| ".inst 0x658aaf71 // bfcvt z17.h, p3/M, z27.s\n" |
| ".inst 0x658aaf50 // bfcvt z16.h, p3/M, z26.s\n" |
| "st1h { z17.s }, p1, [x10, #1, MUL VL]\n" |
| "st1h { z16.s }, p1, [x28, #1, MUL VL]\n" |
| ".inst 0x658aaf32 // bfcvt z18.h, p3/M, z25.s\n" |
| ".inst 0x658aaf11 // bfcvt z17.h, p3/M, z24.s\n" |
| "ld1h { z16.s }, p0/Z, [x10, #2, MUL VL]\n" |
| "st1h { z18.s }, p1, [x27, #1, MUL VL]\n" |
| "st1h { z17.s }, p1, [x26, #1, MUL VL]\n" |
| "ld1h { z18.s }, p0/Z, [x28, #2, MUL VL]\n" |
| "lsl z19.s, z16.s, #0x10\n" |
| "ld1h { z17.s }, p0/Z, [x27, #2, MUL VL]\n" |
| "ld1h { z16.s }, p0/Z, [x26, #2, MUL VL]\n" |
| "lsl z18.s, z18.s, #0x10\n" |
| "lsl z17.s, z17.s, #0x10\n" |
| "fadd z23.s, z23.s, z19.s\n" |
| "lsl z16.s, z16.s, #0x10\n" |
| "fadd z22.s, z22.s, z18.s\n" |
| "fadd z21.s, z21.s, z17.s\n" |
| "fadd z20.s, z20.s, z16.s\n" |
| "fmin z23.s, p3/M, z23.s, z12.s\n" |
| "fmin z22.s, p3/M, z22.s, z12.s\n" |
| "fmin z21.s, p3/M, z21.s, z12.s\n" |
| "fmin z20.s, p3/M, z20.s, z12.s\n" |
| "fmax z23.s, p3/M, z23.s, z11.s\n" |
| "fmax z22.s, p3/M, z22.s, z11.s\n" |
| "fmax z21.s, p3/M, z21.s, z11.s\n" |
| "fmax z20.s, p3/M, z20.s, z11.s\n" |
| ".inst 0x658aaef1 // bfcvt z17.h, p3/M, z23.s\n" |
| ".inst 0x658aaed0 // bfcvt z16.h, p3/M, z22.s\n" |
| "st1h { z17.s }, p0, [x10, #2, MUL VL]\n" |
| "inch x10, ALL, MUL #3\n" |
| "st1h { z16.s }, p0, [x28, #2, MUL VL]\n" |
| "inch x28, ALL, MUL #3\n" |
| ".inst 0x658aaeb1 // bfcvt z17.h, p3/M, z21.s\n" |
| ".inst 0x658aae90 // bfcvt z16.h, p3/M, z20.s\n" |
| "st1h { z17.s }, p0, [x27, #2, MUL VL]\n" |
| "inch x27, ALL, MUL #3\n" |
| "st1h { z16.s }, p0, [x26, #2, MUL VL]\n" |
| "inch x26, ALL, MUL #3\n" |
| "bgt 43b\n" |
| "b 52f\n" |
| "44:" // Accumulate: Height 5 |
| "mov x10, %x[out_ptr]\n" |
| "mov x11, %x[cols]\n" |
| "add x28, x10, %x[ldout], LSL #1\n" |
| "add x27, x28, %x[ldout], LSL #1\n" |
| "add x26, x27, %x[ldout], LSL #1\n" |
| "add x25, x26, %x[ldout], LSL #1\n" |
| "45:" // Accumulate: Height 5: Block loop |
| "mov x21, #0x0\n" |
| "addvl x20, %x[in_ptr], #16\n" |
| "whilelt p2.s, x21, x11\n" |
| "incw x21\n" |
| "ld1h { z20.s }, p2/Z, [x10]\n" |
| "ld1h { z19.s }, p2/Z, [x28]\n" |
| "ld1h { z18.s }, p2/Z, [x27]\n" |
| "ld1h { z17.s }, p2/Z, [x26]\n" |
| "ld1h { z16.s }, p2/Z, [x25]\n" |
| "ld1w { z1.s }, p2/Z, [%x[in_ptr]]\n" |
| "lsl z22.s, z20.s, #0x10\n" |
| "ld1w { z0.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n" |
| "lsl z21.s, z19.s, #0x10\n" |
| "ld1w { z31.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n" |
| "whilelt p1.s, x21, x11\n" |
| "lsl z19.s, z18.s, #0x10\n" |
| "ld1w { z20.s }, p2/Z, [x20, #-7, MUL VL]\n" |
| "lsl z18.s, z17.s, #0x10\n" |
| "ld1w { z17.s }, p2/Z, [x20, #-4, MUL VL]\n" |
| "lsl z16.s, z16.s, #0x10\n" |
| "fadd z1.s, z1.s, z22.s\n" |
| "incw x21\n" |
| "fadd z0.s, z0.s, z21.s\n" |
| "ld1w { z30.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" |
| "ld1w { z29.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n" |
| "fadd z31.s, z31.s, z19.s\n" |
| "fadd z20.s, z20.s, z18.s\n" |
| "ld1w { z28.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n" |
| "ld1w { z27.s }, p1/Z, [x20, #-6, MUL VL]\n" |
| "fadd z17.s, z17.s, z16.s\n" |
| "fmin z1.s, p3/M, z1.s, z12.s\n" |
| "ld1w { z26.s }, p1/Z, [x20, #-3, MUL VL]\n" |
| "whilelt p0.s, x21, x11\n" |
| "fmin z0.s, p3/M, z0.s, z12.s\n" |
| "fmin z31.s, p3/M, z31.s, z12.s\n" |
| "fmin z20.s, p3/M, z20.s, z12.s\n" |
| "fmin z17.s, p3/M, z17.s, z12.s\n" |
| "fmax z1.s, p3/M, z1.s, z11.s\n" |
| "ld1w { z25.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" |
| "ld1w { z24.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n" |
| "decw x11, ALL, MUL #3\n" |
| "fmax z0.s, p3/M, z0.s, z11.s\n" |
| "fmax z31.s, p3/M, z31.s, z11.s\n" |
| "ld1w { z23.s }, p0/Z, [x20, #-8, MUL VL]\n" |
| "ld1w { z22.s }, p0/Z, [x20, #-5, MUL VL]\n" |
| "fmax z20.s, p3/M, z20.s, z11.s\n" |
| "fmax z17.s, p3/M, z17.s, z11.s\n" |
| "ld1w { z21.s }, p0/Z, [x20, #-2, MUL VL]\n" |
| ".inst 0x658aac30 // bfcvt z16.h, p3/M, z1.s\n" |
| "cmp x11, XZR\n" |
| "incw x21\n" |
| ".inst 0x658aac13 // bfcvt z19.h, p3/M, z0.s\n" |
| ".inst 0x658aaff2 // bfcvt z18.h, p3/M, z31.s\n" |
| "addvl %x[in_ptr], %x[in_ptr], #24\n" |
| "st1h { z16.s }, p2, [x10]\n" |
| ".inst 0x658aae90 // bfcvt z16.h, p3/M, z20.s\n" |
| ".inst 0x658aae31 // bfcvt z17.h, p3/M, z17.s\n" |
| "st1h { z19.s }, p2, [x28]\n" |
| "st1h { z18.s }, p2, [x27]\n" |
| "st1h { z16.s }, p2, [x26]\n" |
| "ld1h { z16.s }, p1/Z, [x10, #1, MUL VL]\n" |
| "st1h { z17.s }, p2, [x25]\n" |
| "ld1h { z19.s }, p1/Z, [x28, #1, MUL VL]\n" |
| "ld1h { z18.s }, p1/Z, [x27, #1, MUL VL]\n" |
| "ld1h { z17.s }, p1/Z, [x26, #1, MUL VL]\n" |
| "lsl z20.s, z16.s, #0x10\n" |
| "ld1h { z16.s }, p1/Z, [x25, #1, MUL VL]\n" |
| "lsl z19.s, z19.s, #0x10\n" |
| "lsl z18.s, z18.s, #0x10\n" |
| "lsl z17.s, z17.s, #0x10\n" |
| "lsl z16.s, z16.s, #0x10\n" |
| "fadd z30.s, z30.s, z20.s\n" |
| "fadd z29.s, z29.s, z19.s\n" |
| "fadd z28.s, z28.s, z18.s\n" |
| "fadd z27.s, z27.s, z17.s\n" |
| "fadd z26.s, z26.s, z16.s\n" |
| "fmin z30.s, p3/M, z30.s, z12.s\n" |
| "fmin z29.s, p3/M, z29.s, z12.s\n" |
| "fmin z28.s, p3/M, z28.s, z12.s\n" |
| "fmin z27.s, p3/M, z27.s, z12.s\n" |
| "fmin z26.s, p3/M, z26.s, z12.s\n" |
| "fmax z30.s, p3/M, z30.s, z11.s\n" |
| "fmax z29.s, p3/M, z29.s, z11.s\n" |
| "fmax z28.s, p3/M, z28.s, z11.s\n" |
| "fmax z27.s, p3/M, z27.s, z11.s\n" |
| "fmax z26.s, p3/M, z26.s, z11.s\n" |
| ".inst 0x658aafd2 // bfcvt z18.h, p3/M, z30.s\n" |
| ".inst 0x658aafb1 // bfcvt z17.h, p3/M, z29.s\n" |
| ".inst 0x658aaf90 // bfcvt z16.h, p3/M, z28.s\n" |
| "st1h { z18.s }, p1, [x10, #1, MUL VL]\n" |
| "st1h { z17.s }, p1, [x28, #1, MUL VL]\n" |
| ".inst 0x658aaf72 // bfcvt z18.h, p3/M, z27.s\n" |
| ".inst 0x658aaf51 // bfcvt z17.h, p3/M, z26.s\n" |
| "st1h { z16.s }, p1, [x27, #1, MUL VL]\n" |
| "ld1h { z16.s }, p0/Z, [x10, #2, MUL VL]\n" |
| "st1h { z18.s }, p1, [x26, #1, MUL VL]\n" |
| "st1h { z17.s }, p1, [x25, #1, MUL VL]\n" |
| "ld1h { z19.s }, p0/Z, [x28, #2, MUL VL]\n" |
| "ld1h { z18.s }, p0/Z, [x27, #2, MUL VL]\n" |
| "lsl z20.s, z16.s, #0x10\n" |
| "ld1h { z17.s }, p0/Z, [x26, #2, MUL VL]\n" |
| "ld1h { z16.s }, p0/Z, [x25, #2, MUL VL]\n" |
| "lsl z19.s, z19.s, #0x10\n" |
| "lsl z18.s, z18.s, #0x10\n" |
| "fadd z25.s, z25.s, z20.s\n" |
| "lsl z17.s, z17.s, #0x10\n" |
| "lsl z16.s, z16.s, #0x10\n" |
| "fadd z24.s, z24.s, z19.s\n" |
| "fadd z23.s, z23.s, z18.s\n" |
| "fadd z22.s, z22.s, z17.s\n" |
| "fmin z25.s, p3/M, z25.s, z12.s\n" |
| "fadd z21.s, z21.s, z16.s\n" |
| "fmin z24.s, p3/M, z24.s, z12.s\n" |
| "fmin z23.s, p3/M, z23.s, z12.s\n" |
| "fmin z22.s, p3/M, z22.s, z12.s\n" |
| "fmax z25.s, p3/M, z25.s, z11.s\n" |
| "fmin z21.s, p3/M, z21.s, z12.s\n" |
| "fmax z24.s, p3/M, z24.s, z11.s\n" |
| "fmax z23.s, p3/M, z23.s, z11.s\n" |
| "fmax z22.s, p3/M, z22.s, z11.s\n" |
| ".inst 0x658aaf31 // bfcvt z17.h, p3/M, z25.s\n" |
| "fmax z21.s, p3/M, z21.s, z11.s\n" |
| ".inst 0x658aaf10 // bfcvt z16.h, p3/M, z24.s\n" |
| "st1h { z17.s }, p0, [x10, #2, MUL VL]\n" |
| "inch x10, ALL, MUL #3\n" |
| ".inst 0x658aaef2 // bfcvt z18.h, p3/M, z23.s\n" |
| ".inst 0x658aaed1 // bfcvt z17.h, p3/M, z22.s\n" |
| "st1h { z16.s }, p0, [x28, #2, MUL VL]\n" |
| "inch x28, ALL, MUL #3\n" |
| ".inst 0x658aaeb0 // bfcvt z16.h, p3/M, z21.s\n" |
| "st1h { z18.s }, p0, [x27, #2, MUL VL]\n" |
| "inch x27, ALL, MUL #3\n" |
| "st1h { z17.s }, p0, [x26, #2, MUL VL]\n" |
| "inch x26, ALL, MUL #3\n" |
| "st1h { z16.s }, p0, [x25, #2, MUL VL]\n" |
| "inch x25, ALL, MUL #3\n" |
| "bgt 45b\n" |
| "b 52f\n" |
| "46:" // Accumulate: Height 6 |
| "mov x10, %x[out_ptr]\n" |
| "mov x11, %x[cols]\n" |
| "add x28, x10, %x[ldout], LSL #1\n" |
| "add x27, x28, %x[ldout], LSL #1\n" |
| "add x26, x27, %x[ldout], LSL #1\n" |
| "add x25, x26, %x[ldout], LSL #1\n" |
| "add x24, x25, %x[ldout], LSL #1\n" |
| "47:" // Accumulate: Height 6: Block loop |
| "mov x21, #0x0\n" |
| "addvl x20, %x[in_ptr], #16\n" |
| "whilelt p2.s, x21, x11\n" |
| "incw x21\n" |
| "ld1h { z21.s }, p2/Z, [x10]\n" |
| "ld1h { z20.s }, p2/Z, [x28]\n" |
| "ld1h { z19.s }, p2/Z, [x27]\n" |
| "ld1h { z18.s }, p2/Z, [x26]\n" |
| "ld1h { z17.s }, p2/Z, [x25]\n" |
| "ld1h { z16.s }, p2/Z, [x24]\n" |
| "ld1w { z6.s }, p2/Z, [%x[in_ptr]]\n" |
| "lsl z22.s, z21.s, #0x10\n" |
| "ld1w { z5.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n" |
| "lsl z21.s, z20.s, #0x10\n" |
| "ld1w { z4.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n" |
| "lsl z20.s, z19.s, #0x10\n" |
| "ld1w { z3.s }, p2/Z, [x20, #-7, MUL VL]\n" |
| "lsl z19.s, z18.s, #0x10\n" |
| "ld1w { z2.s }, p2/Z, [x20, #-4, MUL VL]\n" |
| "lsl z17.s, z17.s, #0x10\n" |
| "ld1w { z18.s }, p2/Z, [x20, #-1, MUL VL]\n" |
| "lsl z16.s, z16.s, #0x10\n" |
| "fadd z6.s, z6.s, z22.s\n" |
| "fadd z5.s, z5.s, z21.s\n" |
| "whilelt p1.s, x21, x11\n" |
| "incw x21\n" |
| "fadd z4.s, z4.s, z20.s\n" |
| "fadd z3.s, z3.s, z19.s\n" |
| "fadd z2.s, z2.s, z17.s\n" |
| "fadd z18.s, z18.s, z16.s\n" |
| "fmin z6.s, p3/M, z6.s, z12.s\n" |
| "fmin z5.s, p3/M, z5.s, z12.s\n" |
| "ld1w { z1.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" |
| "ld1w { z0.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n" |
| "whilelt p0.s, x21, x11\n" |
| "decw x11, ALL, MUL #3\n" |
| "fmin z4.s, p3/M, z4.s, z12.s\n" |
| "fmin z3.s, p3/M, z3.s, z12.s\n" |
| "ld1w { z31.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n" |
| "ld1w { z30.s }, p1/Z, [x20, #-6, MUL VL]\n" |
| "fmin z2.s, p3/M, z2.s, z12.s\n" |
| "fmin z18.s, p3/M, z18.s, z12.s\n" |
| "ld1w { z29.s }, p1/Z, [x20, #-3, MUL VL]\n" |
| "ld1w { z28.s }, p1/Z, [x20]\n" |
| "fmax z6.s, p3/M, z6.s, z11.s\n" |
| "fmax z5.s, p3/M, z5.s, z11.s\n" |
| "ld1w { z27.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" |
| "ld1w { z26.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n" |
| "fmax z4.s, p3/M, z4.s, z11.s\n" |
| "fmax z3.s, p3/M, z3.s, z11.s\n" |
| "ld1w { z25.s }, p0/Z, [x20, #-8, MUL VL]\n" |
| "ld1w { z24.s }, p0/Z, [x20, #-5, MUL VL]\n" |
| "fmax z2.s, p3/M, z2.s, z11.s\n" |
| "fmax z18.s, p3/M, z18.s, z11.s\n" |
| "ld1w { z23.s }, p0/Z, [x20, #-2, MUL VL]\n" |
| "ld1w { z22.s }, p0/Z, [x20, #1, MUL VL]\n" |
| ".inst 0x658aacd5 // bfcvt z21.h, p3/M, z6.s\n" |
| ".inst 0x658aacb4 // bfcvt z20.h, p3/M, z5.s\n" |
| "cmp x11, XZR\n" |
| "incw x21\n" |
| ".inst 0x658aac93 // bfcvt z19.h, p3/M, z4.s\n" |
| ".inst 0x658aac71 // bfcvt z17.h, p3/M, z3.s\n" |
| "addvl %x[in_ptr], %x[in_ptr], #24\n" |
| ".inst 0x658aac50 // bfcvt z16.h, p3/M, z2.s\n" |
| ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n" |
| "st1h { z21.s }, p2, [x10]\n" |
| "st1h { z20.s }, p2, [x28]\n" |
| "st1h { z19.s }, p2, [x27]\n" |
| "st1h { z17.s }, p2, [x26]\n" |
| "ld1h { z17.s }, p1/Z, [x10, #1, MUL VL]\n" |
| "st1h { z16.s }, p2, [x25]\n" |
| "ld1h { z16.s }, p1/Z, [x28, #1, MUL VL]\n" |
| "st1h { z18.s }, p2, [x24]\n" |
| "ld1h { z19.s }, p1/Z, [x27, #1, MUL VL]\n" |
| "ld1h { z18.s }, p1/Z, [x26, #1, MUL VL]\n" |
| "lsl z21.s, z17.s, #0x10\n" |
| "ld1h { z17.s }, p1/Z, [x25, #1, MUL VL]\n" |
| "lsl z20.s, z16.s, #0x10\n" |
| "ld1h { z16.s }, p1/Z, [x24, #1, MUL VL]\n" |
| "lsl z19.s, z19.s, #0x10\n" |
| "lsl z18.s, z18.s, #0x10\n" |
| "fadd z1.s, z1.s, z21.s\n" |
| "lsl z17.s, z17.s, #0x10\n" |
| "lsl z16.s, z16.s, #0x10\n" |
| "fadd z0.s, z0.s, z20.s\n" |
| "fadd z31.s, z31.s, z19.s\n" |
| "fadd z30.s, z30.s, z18.s\n" |
| "fmin z1.s, p3/M, z1.s, z12.s\n" |
| "fadd z29.s, z29.s, z17.s\n" |
| "fadd z28.s, z28.s, z16.s\n" |
| "fmin z0.s, p3/M, z0.s, z12.s\n" |
| "fmin z31.s, p3/M, z31.s, z12.s\n" |
| "fmin z30.s, p3/M, z30.s, z12.s\n" |
| "fmin z29.s, p3/M, z29.s, z12.s\n" |
| "fmax z1.s, p3/M, z1.s, z11.s\n" |
| "fmin z28.s, p3/M, z28.s, z12.s\n" |
| "fmax z0.s, p3/M, z0.s, z11.s\n" |
| "fmax z31.s, p3/M, z31.s, z11.s\n" |
| "fmax z30.s, p3/M, z30.s, z11.s\n" |
| "fmax z29.s, p3/M, z29.s, z11.s\n" |
| "fmax z28.s, p3/M, z28.s, z11.s\n" |
| ".inst 0x658aac34 // bfcvt z20.h, p3/M, z1.s\n" |
| ".inst 0x658aac12 // bfcvt z18.h, p3/M, z0.s\n" |
| ".inst 0x658aaff3 // bfcvt z19.h, p3/M, z31.s\n" |
| ".inst 0x658aafd1 // bfcvt z17.h, p3/M, z30.s\n" |
| ".inst 0x658aafb0 // bfcvt z16.h, p3/M, z29.s\n" |
| "st1h { z20.s }, p1, [x10, #1, MUL VL]\n" |
| "st1h { z18.s }, p1, [x28, #1, MUL VL]\n" |
| ".inst 0x658aaf92 // bfcvt z18.h, p3/M, z28.s\n" |
| "st1h { z19.s }, p1, [x27, #1, MUL VL]\n" |
| "st1h { z17.s }, p1, [x26, #1, MUL VL]\n" |
| "ld1h { z17.s }, p0/Z, [x10, #2, MUL VL]\n" |
| "st1h { z16.s }, p1, [x25, #1, MUL VL]\n" |
| "ld1h { z16.s }, p0/Z, [x28, #2, MUL VL]\n" |
| "st1h { z18.s }, p1, [x24, #1, MUL VL]\n" |
| "ld1h { z19.s }, p0/Z, [x27, #2, MUL VL]\n" |
| "ld1h { z18.s }, p0/Z, [x26, #2, MUL VL]\n" |
| "lsl z21.s, z17.s, #0x10\n" |
| "ld1h { z17.s }, p0/Z, [x25, #2, MUL VL]\n" |
| "lsl z20.s, z16.s, #0x10\n" |
| "ld1h { z16.s }, p0/Z, [x24, #2, MUL VL]\n" |
| "lsl z19.s, z19.s, #0x10\n" |
| "lsl z18.s, z18.s, #0x10\n" |
| "fadd z27.s, z27.s, z21.s\n" |
| "lsl z17.s, z17.s, #0x10\n" |
| "lsl z16.s, z16.s, #0x10\n" |
| "fadd z26.s, z26.s, z20.s\n" |
| "fadd z25.s, z25.s, z19.s\n" |
| "fadd z24.s, z24.s, z18.s\n" |
| "fmin z27.s, p3/M, z27.s, z12.s\n" |
| "fadd z23.s, z23.s, z17.s\n" |
| "fadd z22.s, z22.s, z16.s\n" |
| "fmin z26.s, p3/M, z26.s, z12.s\n" |
| "fmin z25.s, p3/M, z25.s, z12.s\n" |
| "fmin z24.s, p3/M, z24.s, z12.s\n" |
| "fmin z23.s, p3/M, z23.s, z12.s\n" |
| "fmax z27.s, p3/M, z27.s, z11.s\n" |
| "fmin z22.s, p3/M, z22.s, z12.s\n" |
| "fmax z26.s, p3/M, z26.s, z11.s\n" |
| "fmax z25.s, p3/M, z25.s, z11.s\n" |
| "fmax z24.s, p3/M, z24.s, z11.s\n" |
| "fmax z23.s, p3/M, z23.s, z11.s\n" |
| "fmax z22.s, p3/M, z22.s, z11.s\n" |
| ".inst 0x658aaf74 // bfcvt z20.h, p3/M, z27.s\n" |
| ".inst 0x658aaf50 // bfcvt z16.h, p3/M, z26.s\n" |
| ".inst 0x658aaf33 // bfcvt z19.h, p3/M, z25.s\n" |
| ".inst 0x658aaf12 // bfcvt z18.h, p3/M, z24.s\n" |
| ".inst 0x658aaef1 // bfcvt z17.h, p3/M, z23.s\n" |
| "st1h { z20.s }, p0, [x10, #2, MUL VL]\n" |
| "inch x10, ALL, MUL #3\n" |
| "st1h { z16.s }, p0, [x28, #2, MUL VL]\n" |
| ".inst 0x658aaed0 // bfcvt z16.h, p3/M, z22.s\n" |
| "inch x28, ALL, MUL #3\n" |
| "st1h { z19.s }, p0, [x27, #2, MUL VL]\n" |
| "inch x27, ALL, MUL #3\n" |
| "st1h { z18.s }, p0, [x26, #2, MUL VL]\n" |
| "inch x26, ALL, MUL #3\n" |
| "st1h { z17.s }, p0, [x25, #2, MUL VL]\n" |
| "inch x25, ALL, MUL #3\n" |
| "st1h { z16.s }, p0, [x24, #2, MUL VL]\n" |
| "inch x24, ALL, MUL #3\n" |
| "bgt 47b\n" |
| "b 52f\n" |
| "48:" // Accumulate: Height 7 |
| "mov x10, %x[out_ptr]\n" |
| "mov x11, %x[cols]\n" |
| "add x28, x10, %x[ldout], LSL #1\n" |
| "add x27, x28, %x[ldout], LSL #1\n" |
| "add x26, x27, %x[ldout], LSL #1\n" |
| "add x25, x26, %x[ldout], LSL #1\n" |
| "add x24, x25, %x[ldout], LSL #1\n" |
| "add x23, x24, %x[ldout], LSL #1\n" |
| "49:" // Accumulate: Height 7: Block loop |
| "mov x21, #0x0\n" |
| "addvl x20, %x[in_ptr], #16\n" |
| "whilelt p2.s, x21, x11\n" |
| "incw x21\n" |
| "ld1h { z22.s }, p2/Z, [x10]\n" |
| "ld1h { z21.s }, p2/Z, [x28]\n" |
| "ld1h { z20.s }, p2/Z, [x27]\n" |
| "ld1h { z19.s }, p2/Z, [x26]\n" |
| "ld1h { z18.s }, p2/Z, [x25]\n" |
| "ld1h { z17.s }, p2/Z, [x24]\n" |
| "ld1h { z16.s }, p2/Z, [x23]\n" |
| "ld1w { z8.s }, p2/Z, [%x[in_ptr]]\n" |
| "lsl z25.s, z22.s, #0x10\n" |
| "lsl z24.s, z21.s, #0x10\n" |
| "ld1w { z21.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n" |
| "ld1w { z7.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n" |
| "lsl z20.s, z20.s, #0x10\n" |
| "lsl z19.s, z19.s, #0x10\n" |
| "ld1w { z23.s }, p2/Z, [x20, #-7, MUL VL]\n" |
| "ld1w { z6.s }, p2/Z, [x20, #-4, MUL VL]\n" |
| "lsl z18.s, z18.s, #0x10\n" |
| "lsl z17.s, z17.s, #0x10\n" |
| "ld1w { z5.s }, p2/Z, [x20, #-1, MUL VL]\n" |
| "ld1w { z22.s }, p2/Z, [x20, #2, MUL VL]\n" |
| "lsl z16.s, z16.s, #0x10\n" |
| "fadd z8.s, z8.s, z25.s\n" |
| "fadd z21.s, z21.s, z24.s\n" |
| "fadd z7.s, z7.s, z20.s\n" |
| "whilelt p1.s, x21, x11\n" |
| "incw x21\n" |
| "fadd z23.s, z23.s, z19.s\n" |
| "fadd z6.s, z6.s, z18.s\n" |
| "fadd z5.s, z5.s, z17.s\n" |
| "fadd z22.s, z22.s, z16.s\n" |
| "fmin z8.s, p3/M, z8.s, z12.s\n" |
| "fmin z21.s, p3/M, z21.s, z12.s\n" |
| "fmin z7.s, p3/M, z7.s, z12.s\n" |
| "ld1w { z4.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" |
| "ld1w { z3.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n" |
| "whilelt p0.s, x21, x11\n" |
| "fmin z23.s, p3/M, z23.s, z12.s\n" |
| "fmin z6.s, p3/M, z6.s, z12.s\n" |
| "ld1w { z2.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n" |
| "ld1w { z1.s }, p1/Z, [x20, #-6, MUL VL]\n" |
| "fmin z5.s, p3/M, z5.s, z12.s\n" |
| "fmin z22.s, p3/M, z22.s, z12.s\n" |
| "ld1w { z0.s }, p1/Z, [x20, #-3, MUL VL]\n" |
| "ld1w { z31.s }, p1/Z, [x20]\n" |
| "fmax z8.s, p3/M, z8.s, z11.s\n" |
| "fmax z21.s, p3/M, z21.s, z11.s\n" |
| "ld1w { z30.s }, p1/Z, [x20, #3, MUL VL]\n" |
| "ld1w { z29.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" |
| "fmax z7.s, p3/M, z7.s, z11.s\n" |
| "fmax z23.s, p3/M, z23.s, z11.s\n" |
| "ld1w { z28.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n" |
| "ld1w { z27.s }, p0/Z, [x20, #-8, MUL VL]\n" |
| "fmax z6.s, p3/M, z6.s, z11.s\n" |
| "fmax z5.s, p3/M, z5.s, z11.s\n" |
| "ld1w { z26.s }, p0/Z, [x20, #-5, MUL VL]\n" |
| "ld1w { z25.s }, p0/Z, [x20, #-2, MUL VL]\n" |
| "fmax z22.s, p3/M, z22.s, z11.s\n" |
| ".inst 0x658aad13 // bfcvt z19.h, p3/M, z8.s\n" |
| ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n" |
| "ld1w { z24.s }, p0/Z, [x20, #1, MUL VL]\n" |
| ".inst 0x658aacf4 // bfcvt z20.h, p3/M, z7.s\n" |
| ".inst 0x658aaef2 // bfcvt z18.h, p3/M, z23.s\n" |
| "ld1w { z23.s }, p0/Z, [x20, #4, MUL VL]\n" |
| "decw x11, ALL, MUL #3\n" |
| ".inst 0x658aacd1 // bfcvt z17.h, p3/M, z6.s\n" |
| ".inst 0x658aacb0 // bfcvt z16.h, p3/M, z5.s\n" |
| "incw x21\n" |
| "addvl %x[in_ptr], %x[in_ptr], #24\n" |
| "st1h { z19.s }, p2, [x10]\n" |
| ".inst 0x658aaed3 // bfcvt z19.h, p3/M, z22.s\n" |
| "st1h { z21.s }, p2, [x28]\n" |
| "cmp x11, XZR\n" |
| "st1h { z20.s }, p2, [x27]\n" |
| "st1h { z18.s }, p2, [x26]\n" |
| "ld1h { z18.s }, p1/Z, [x10, #1, MUL VL]\n" |
| "st1h { z17.s }, p2, [x25]\n" |
| "ld1h { z17.s }, p1/Z, [x28, #1, MUL VL]\n" |
| "st1h { z16.s }, p2, [x24]\n" |
| "ld1h { z16.s }, p1/Z, [x27, #1, MUL VL]\n" |
| "st1h { z19.s }, p2, [x23]\n" |
| "ld1h { z19.s }, p1/Z, [x26, #1, MUL VL]\n" |
| "lsl z22.s, z18.s, #0x10\n" |
| "ld1h { z18.s }, p1/Z, [x25, #1, MUL VL]\n" |
| "lsl z21.s, z17.s, #0x10\n" |
| "ld1h { z17.s }, p1/Z, [x24, #1, MUL VL]\n" |
| "lsl z20.s, z16.s, #0x10\n" |
| "ld1h { z16.s }, p1/Z, [x23, #1, MUL VL]\n" |
| "lsl z19.s, z19.s, #0x10\n" |
| "lsl z18.s, z18.s, #0x10\n" |
| "fadd z4.s, z4.s, z22.s\n" |
| "lsl z17.s, z17.s, #0x10\n" |
| "lsl z16.s, z16.s, #0x10\n" |
| "fadd z3.s, z3.s, z21.s\n" |
| "fadd z2.s, z2.s, z20.s\n" |
| "fadd z1.s, z1.s, z19.s\n" |
| "fadd z0.s, z0.s, z18.s\n" |
| "fadd z31.s, z31.s, z17.s\n" |
| "fmin z4.s, p3/M, z4.s, z12.s\n" |
| "fadd z30.s, z30.s, z16.s\n" |
| "fmin z3.s, p3/M, z3.s, z12.s\n" |
| "fmin z2.s, p3/M, z2.s, z12.s\n" |
| "fmin z1.s, p3/M, z1.s, z12.s\n" |
| "fmin z0.s, p3/M, z0.s, z12.s\n" |
| "fmin z31.s, p3/M, z31.s, z12.s\n" |
| "fmax z4.s, p3/M, z4.s, z11.s\n" |
| "fmin z30.s, p3/M, z30.s, z12.s\n" |
| "fmax z3.s, p3/M, z3.s, z11.s\n" |
| "fmax z2.s, p3/M, z2.s, z11.s\n" |
| "fmax z1.s, p3/M, z1.s, z11.s\n" |
| "fmax z0.s, p3/M, z0.s, z11.s\n" |
| "fmax z31.s, p3/M, z31.s, z11.s\n" |
| ".inst 0x658aac90 // bfcvt z16.h, p3/M, z4.s\n" |
| "fmax z30.s, p3/M, z30.s, z11.s\n" |
| ".inst 0x658aac74 // bfcvt z20.h, p3/M, z3.s\n" |
| ".inst 0x658aac53 // bfcvt z19.h, p3/M, z2.s\n" |
| ".inst 0x658aac32 // bfcvt z18.h, p3/M, z1.s\n" |
| "st1h { z16.s }, p1, [x10, #1, MUL VL]\n" |
| ".inst 0x658aac11 // bfcvt z17.h, p3/M, z0.s\n" |
| ".inst 0x658aaff0 // bfcvt z16.h, p3/M, z31.s\n" |
| "st1h { z20.s }, p1, [x28, #1, MUL VL]\n" |
| "st1h { z19.s }, p1, [x27, #1, MUL VL]\n" |
| ".inst 0x658aafd3 // bfcvt z19.h, p3/M, z30.s\n" |
| "st1h { z18.s }, p1, [x26, #1, MUL VL]\n" |
| "ld1h { z18.s }, p0/Z, [x10, #2, MUL VL]\n" |
| "st1h { z17.s }, p1, [x25, #1, MUL VL]\n" |
| "ld1h { z17.s }, p0/Z, [x28, #2, MUL VL]\n" |
| "st1h { z16.s }, p1, [x24, #1, MUL VL]\n" |
| "ld1h { z16.s }, p0/Z, [x27, #2, MUL VL]\n" |
| "st1h { z19.s }, p1, [x23, #1, MUL VL]\n" |
| "ld1h { z19.s }, p0/Z, [x26, #2, MUL VL]\n" |
| "lsl z22.s, z18.s, #0x10\n" |
| "ld1h { z18.s }, p0/Z, [x25, #2, MUL VL]\n" |
| "lsl z21.s, z17.s, #0x10\n" |
| "ld1h { z17.s }, p0/Z, [x24, #2, MUL VL]\n" |
| "lsl z20.s, z16.s, #0x10\n" |
| "ld1h { z16.s }, p0/Z, [x23, #2, MUL VL]\n" |
| "lsl z19.s, z19.s, #0x10\n" |
| "lsl z18.s, z18.s, #0x10\n" |
| "fadd z29.s, z29.s, z22.s\n" |
| "lsl z17.s, z17.s, #0x10\n" |
| "lsl z16.s, z16.s, #0x10\n" |
| "fadd z28.s, z28.s, z21.s\n" |
| "fadd z27.s, z27.s, z20.s\n" |
| "fadd z26.s, z26.s, z19.s\n" |
| "fadd z25.s, z25.s, z18.s\n" |
| "fadd z24.s, z24.s, z17.s\n" |
| "fmin z29.s, p3/M, z29.s, z12.s\n" |
| "fadd z23.s, z23.s, z16.s\n" |
| "fmin z28.s, p3/M, z28.s, z12.s\n" |
| "fmin z27.s, p3/M, z27.s, z12.s\n" |
| "fmin z26.s, p3/M, z26.s, z12.s\n" |
| "fmin z25.s, p3/M, z25.s, z12.s\n" |
| "fmin z24.s, p3/M, z24.s, z12.s\n" |
| "fmax z29.s, p3/M, z29.s, z11.s\n" |
| "fmin z23.s, p3/M, z23.s, z12.s\n" |
| "fmax z28.s, p3/M, z28.s, z11.s\n" |
| "fmax z27.s, p3/M, z27.s, z11.s\n" |
| "fmax z26.s, p3/M, z26.s, z11.s\n" |
| "fmax z25.s, p3/M, z25.s, z11.s\n" |
| "fmax z24.s, p3/M, z24.s, z11.s\n" |
| ".inst 0x658aafb1 // bfcvt z17.h, p3/M, z29.s\n" |
| "fmax z23.s, p3/M, z23.s, z11.s\n" |
| ".inst 0x658aaf94 // bfcvt z20.h, p3/M, z28.s\n" |
| ".inst 0x658aaf70 // bfcvt z16.h, p3/M, z27.s\n" |
| ".inst 0x658aaf53 // bfcvt z19.h, p3/M, z26.s\n" |
| "st1h { z17.s }, p0, [x10, #2, MUL VL]\n" |
| "inch x10, ALL, MUL #3\n" |
| ".inst 0x658aaf32 // bfcvt z18.h, p3/M, z25.s\n" |
| ".inst 0x658aaf11 // bfcvt z17.h, p3/M, z24.s\n" |
| "st1h { z20.s }, p0, [x28, #2, MUL VL]\n" |
| "inch x28, ALL, MUL #3\n" |
| "st1h { z16.s }, p0, [x27, #2, MUL VL]\n" |
| ".inst 0x658aaef0 // bfcvt z16.h, p3/M, z23.s\n" |
| "inch x27, ALL, MUL #3\n" |
| "st1h { z19.s }, p0, [x26, #2, MUL VL]\n" |
| "inch x26, ALL, MUL #3\n" |
| "st1h { z18.s }, p0, [x25, #2, MUL VL]\n" |
| "inch x25, ALL, MUL #3\n" |
| "st1h { z17.s }, p0, [x24, #2, MUL VL]\n" |
| "inch x24, ALL, MUL #3\n" |
| "st1h { z16.s }, p0, [x23, #2, MUL VL]\n" |
| "inch x23, ALL, MUL #3\n" |
| "bgt 49b\n" |
| "b 52f\n" |
| "50:" // Accumulate: Height 8 |
| "mov x10, %x[out_ptr]\n" |
| "mov x11, %x[cols]\n" |
| "add x28, x10, %x[ldout], LSL #1\n" |
| "add x27, x28, %x[ldout], LSL #1\n" |
| "add x26, x27, %x[ldout], LSL #1\n" |
| "add x25, x26, %x[ldout], LSL #1\n" |
| "add x24, x25, %x[ldout], LSL #1\n" |
| "add x23, x24, %x[ldout], LSL #1\n" |
| "add x22, x23, %x[ldout], LSL #1\n" |
| "51:" // Accumulate: Height 8: Block loop |
| "mov x21, #0x0\n" |
| "addvl x20, %x[in_ptr], #16\n" |
| "whilelt p2.s, x21, x11\n" |
| "incw x21\n" |
| "ld1h { z23.s }, p2/Z, [x10]\n" |
| "ld1h { z22.s }, p2/Z, [x28]\n" |
| "ld1h { z21.s }, p2/Z, [x27]\n" |
| "ld1h { z20.s }, p2/Z, [x26]\n" |
| "ld1h { z19.s }, p2/Z, [x25]\n" |
| "ld1h { z18.s }, p2/Z, [x24]\n" |
| "ld1h { z17.s }, p2/Z, [x23]\n" |
| "ld1h { z16.s }, p2/Z, [x22]\n" |
| "lsl z31.s, z23.s, #0x10\n" |
| "lsl z30.s, z22.s, #0x10\n" |
| "ld1w { z29.s }, p2/Z, [%x[in_ptr]]\n" |
| "ld1w { z28.s }, p2/Z, [%x[in_ptr], #3, MUL VL]\n" |
| "lsl z27.s, z21.s, #0x10\n" |
| "lsl z26.s, z20.s, #0x10\n" |
| "ld1w { z21.s }, p2/Z, [%x[in_ptr], #6, MUL VL]\n" |
| "ld1w { z25.s }, p2/Z, [x20, #-7, MUL VL]\n" |
| "lsl z20.s, z19.s, #0x10\n" |
| "lsl z19.s, z18.s, #0x10\n" |
| "ld1w { z18.s }, p2/Z, [x20, #-4, MUL VL]\n" |
| "ld1w { z24.s }, p2/Z, [x20, #-1, MUL VL]\n" |
| "lsl z17.s, z17.s, #0x10\n" |
| "lsl z16.s, z16.s, #0x10\n" |
| "ld1w { z23.s }, p2/Z, [x20, #2, MUL VL]\n" |
| "ld1w { z22.s }, p2/Z, [x20, #5, MUL VL]\n" |
| "fadd z29.s, z29.s, z31.s\n" |
| "fadd z28.s, z28.s, z30.s\n" |
| "fadd z21.s, z21.s, z27.s\n" |
| "fadd z25.s, z25.s, z26.s\n" |
| "whilelt p1.s, x21, x11\n" |
| "incw x21\n" |
| "fadd z18.s, z18.s, z20.s\n" |
| "fadd z24.s, z24.s, z19.s\n" |
| "fadd z23.s, z23.s, z17.s\n" |
| "fadd z22.s, z22.s, z16.s\n" |
| "fmin z29.s, p3/M, z29.s, z12.s\n" |
| "fmin z28.s, p3/M, z28.s, z12.s\n" |
| "fmin z21.s, p3/M, z21.s, z12.s\n" |
| "fmin z25.s, p3/M, z25.s, z12.s\n" |
| "ld1w { z6.s }, p1/Z, [%x[in_ptr], #1, MUL VL]\n" |
| "ld1w { z5.s }, p1/Z, [%x[in_ptr], #4, MUL VL]\n" |
| "fmin z18.s, p3/M, z18.s, z12.s\n" |
| "fmin z24.s, p3/M, z24.s, z12.s\n" |
| "ld1w { z4.s }, p1/Z, [%x[in_ptr], #7, MUL VL]\n" |
| "ld1w { z3.s }, p1/Z, [x20, #-6, MUL VL]\n" |
| "fmin z23.s, p3/M, z23.s, z12.s\n" |
| "fmin z22.s, p3/M, z22.s, z12.s\n" |
| "ld1w { z2.s }, p1/Z, [x20, #-3, MUL VL]\n" |
| "ld1w { z1.s }, p1/Z, [x20]\n" |
| "fmax z29.s, p3/M, z29.s, z11.s\n" |
| "fmax z28.s, p3/M, z28.s, z11.s\n" |
| "ld1w { z0.s }, p1/Z, [x20, #3, MUL VL]\n" |
| "ld1w { z31.s }, p1/Z, [x20, #6, MUL VL]\n" |
| "fmax z21.s, p3/M, z21.s, z11.s\n" |
| "fmax z25.s, p3/M, z25.s, z11.s\n" |
| "fmax z18.s, p3/M, z18.s, z11.s\n" |
| "fmax z24.s, p3/M, z24.s, z11.s\n" |
| "fmax z23.s, p3/M, z23.s, z11.s\n" |
| "fmax z22.s, p3/M, z22.s, z11.s\n" |
| ".inst 0x658aafb4 // bfcvt z20.h, p3/M, z29.s\n" |
| ".inst 0x658aaf93 // bfcvt z19.h, p3/M, z28.s\n" |
| ".inst 0x658aaeb5 // bfcvt z21.h, p3/M, z21.s\n" |
| ".inst 0x658aaf30 // bfcvt z16.h, p3/M, z25.s\n" |
| "whilelt p0.s, x21, x11\n" |
| "decw x11, ALL, MUL #3\n" |
| ".inst 0x658aae52 // bfcvt z18.h, p3/M, z18.s\n" |
| ".inst 0x658aaf11 // bfcvt z17.h, p3/M, z24.s\n" |
| "incw x21\n" |
| "st1h { z20.s }, p2, [x10]\n" |
| "st1h { z19.s }, p2, [x28]\n" |
| ".inst 0x658aaef4 // bfcvt z20.h, p3/M, z23.s\n" |
| ".inst 0x658aaed3 // bfcvt z19.h, p3/M, z22.s\n" |
| "st1h { z21.s }, p2, [x27]\n" |
| "ld1w { z30.s }, p0/Z, [%x[in_ptr], #2, MUL VL]\n" |
| "ld1w { z29.s }, p0/Z, [%x[in_ptr], #5, MUL VL]\n" |
| "cmp x11, XZR\n" |
| "st1h { z16.s }, p2, [x26]\n" |
| "ld1h { z16.s }, p1/Z, [x10, #1, MUL VL]\n" |
| "ld1w { z28.s }, p0/Z, [x20, #-8, MUL VL]\n" |
| "addvl %x[in_ptr], %x[in_ptr], #24\n" |
| "st1h { z18.s }, p2, [x25]\n" |
| "ld1h { z18.s }, p1/Z, [x28, #1, MUL VL]\n" |
| "ld1w { z27.s }, p0/Z, [x20, #-5, MUL VL]\n" |
| "st1h { z17.s }, p2, [x24]\n" |
| "ld1h { z17.s }, p1/Z, [x27, #1, MUL VL]\n" |
| "ld1w { z26.s }, p0/Z, [x20, #-2, MUL VL]\n" |
| "st1h { z20.s }, p2, [x23]\n" |
| "ld1h { z20.s }, p1/Z, [x26, #1, MUL VL]\n" |
| "lsl z16.s, z16.s, #0x10\n" |
| "ld1w { z25.s }, p0/Z, [x20, #1, MUL VL]\n" |
| "st1h { z19.s }, p2, [x22]\n" |
| "ld1h { z19.s }, p1/Z, [x25, #1, MUL VL]\n" |
| "lsl z22.s, z18.s, #0x10\n" |
| "ld1w { z24.s }, p0/Z, [x20, #4, MUL VL]\n" |
| "ld1h { z18.s }, p1/Z, [x24, #1, MUL VL]\n" |
| "lsl z21.s, z17.s, #0x10\n" |
| "ld1w { z23.s }, p0/Z, [x20, #7, MUL VL]\n" |
| "ld1h { z17.s }, p1/Z, [x23, #1, MUL VL]\n" |
| "lsl z20.s, z20.s, #0x10\n" |
| "fadd z6.s, z6.s, z16.s\n" |
| "ld1h { z16.s }, p1/Z, [x22, #1, MUL VL]\n" |
| "lsl z19.s, z19.s, #0x10\n" |
| "fadd z5.s, z5.s, z22.s\n" |
| "lsl z18.s, z18.s, #0x10\n" |
| "fadd z4.s, z4.s, z21.s\n" |
| "lsl z17.s, z17.s, #0x10\n" |
| "lsl z16.s, z16.s, #0x10\n" |
| "fmin z6.s, p3/M, z6.s, z12.s\n" |
| "fadd z3.s, z3.s, z20.s\n" |
| "fadd z2.s, z2.s, z19.s\n" |
| "fmin z5.s, p3/M, z5.s, z12.s\n" |
| "fadd z1.s, z1.s, z18.s\n" |
| "fmin z4.s, p3/M, z4.s, z12.s\n" |
| "fadd z0.s, z0.s, z17.s\n" |
| "fadd z31.s, z31.s, z16.s\n" |
| "fmax z6.s, p3/M, z6.s, z11.s\n" |
| "fmin z3.s, p3/M, z3.s, z12.s\n" |
| "fmin z2.s, p3/M, z2.s, z12.s\n" |
| "fmax z5.s, p3/M, z5.s, z11.s\n" |
| "fmin z1.s, p3/M, z1.s, z12.s\n" |
| "fmin z0.s, p3/M, z0.s, z12.s\n" |
| "fmin z31.s, p3/M, z31.s, z12.s\n" |
| "fmax z4.s, p3/M, z4.s, z11.s\n" |
| ".inst 0x658aacd0 // bfcvt z16.h, p3/M, z6.s\n" |
| "fmax z3.s, p3/M, z3.s, z11.s\n" |
| "fmax z2.s, p3/M, z2.s, z11.s\n" |
| ".inst 0x658aacb1 // bfcvt z17.h, p3/M, z5.s\n" |
| "fmax z1.s, p3/M, z1.s, z11.s\n" |
| "fmax z0.s, p3/M, z0.s, z11.s\n" |
| "fmax z31.s, p3/M, z31.s, z11.s\n" |
| "st1h { z16.s }, p1, [x10, #1, MUL VL]\n" |
| ".inst 0x658aac90 // bfcvt z16.h, p3/M, z4.s\n" |
| "st1h { z17.s }, p1, [x28, #1, MUL VL]\n" |
| ".inst 0x658aac75 // bfcvt z21.h, p3/M, z3.s\n" |
| ".inst 0x658aac52 // bfcvt z18.h, p3/M, z2.s\n" |
| ".inst 0x658aac31 // bfcvt z17.h, p3/M, z1.s\n" |
| ".inst 0x658aac14 // bfcvt z20.h, p3/M, z0.s\n" |
| "st1h { z16.s }, p1, [x27, #1, MUL VL]\n" |
| ".inst 0x658aaff3 // bfcvt z19.h, p3/M, z31.s\n" |
| "ld1h { z16.s }, p0/Z, [x10, #2, MUL VL]\n" |
| "st1h { z21.s }, p1, [x26, #1, MUL VL]\n" |
| "st1h { z18.s }, p1, [x25, #1, MUL VL]\n" |
| "ld1h { z18.s }, p0/Z, [x28, #2, MUL VL]\n" |
| "st1h { z17.s }, p1, [x24, #1, MUL VL]\n" |
| "ld1h { z17.s }, p0/Z, [x27, #2, MUL VL]\n" |
| "st1h { z20.s }, p1, [x23, #1, MUL VL]\n" |
| "ld1h { z20.s }, p0/Z, [x26, #2, MUL VL]\n" |
| "lsl z16.s, z16.s, #0x10\n" |
| "st1h { z19.s }, p1, [x22, #1, MUL VL]\n" |
| "ld1h { z19.s }, p0/Z, [x25, #2, MUL VL]\n" |
| "lsl z22.s, z18.s, #0x10\n" |
| "ld1h { z18.s }, p0/Z, [x24, #2, MUL VL]\n" |
| "lsl z21.s, z17.s, #0x10\n" |
| "ld1h { z17.s }, p0/Z, [x23, #2, MUL VL]\n" |
| "lsl z20.s, z20.s, #0x10\n" |
| "fadd z30.s, z30.s, z16.s\n" |
| "ld1h { z16.s }, p0/Z, [x22, #2, MUL VL]\n" |
| "lsl z19.s, z19.s, #0x10\n" |
| "lsl z18.s, z18.s, #0x10\n" |
| "fadd z29.s, z29.s, z22.s\n" |
| "lsl z17.s, z17.s, #0x10\n" |
| "fadd z28.s, z28.s, z21.s\n" |
| "lsl z16.s, z16.s, #0x10\n" |
| "fadd z27.s, z27.s, z20.s\n" |
| "fmin z30.s, p3/M, z30.s, z12.s\n" |
| "fadd z26.s, z26.s, z19.s\n" |
| "fadd z25.s, z25.s, z18.s\n" |
| "fmin z29.s, p3/M, z29.s, z12.s\n" |
| "fadd z24.s, z24.s, z17.s\n" |
| "fmin z28.s, p3/M, z28.s, z12.s\n" |
| "fadd z23.s, z23.s, z16.s\n" |
| "fmin z27.s, p3/M, z27.s, z12.s\n" |
| "fmax z30.s, p3/M, z30.s, z11.s\n" |
| "fmin z26.s, p3/M, z26.s, z12.s\n" |
| "fmin z25.s, p3/M, z25.s, z12.s\n" |
| "fmax z29.s, p3/M, z29.s, z11.s\n" |
| "fmin z24.s, p3/M, z24.s, z12.s\n" |
| "fmin z23.s, p3/M, z23.s, z12.s\n" |
| "fmax z28.s, p3/M, z28.s, z11.s\n" |
| "fmax z27.s, p3/M, z27.s, z11.s\n" |
| ".inst 0x658aafd0 // bfcvt z16.h, p3/M, z30.s\n" |
| "fmax z26.s, p3/M, z26.s, z11.s\n" |
| "fmax z25.s, p3/M, z25.s, z11.s\n" |
| ".inst 0x658aafb1 // bfcvt z17.h, p3/M, z29.s\n" |
| "fmax z24.s, p3/M, z24.s, z11.s\n" |
| "fmax z23.s, p3/M, z23.s, z11.s\n" |
| "st1h { z16.s }, p0, [x10, #2, MUL VL]\n" |
| ".inst 0x658aaf90 // bfcvt z16.h, p3/M, z28.s\n" |
| ".inst 0x658aaf74 // bfcvt z20.h, p3/M, z27.s\n" |
| "inch x10, ALL, MUL #3\n" |
| "st1h { z17.s }, p0, [x28, #2, MUL VL]\n" |
| "inch x28, ALL, MUL #3\n" |
| ".inst 0x658aaf53 // bfcvt z19.h, p3/M, z26.s\n" |
| ".inst 0x658aaf32 // bfcvt z18.h, p3/M, z25.s\n" |
| "st1h { z16.s }, p0, [x27, #2, MUL VL]\n" |
| "inch x27, ALL, MUL #3\n" |
| ".inst 0x658aaf11 // bfcvt z17.h, p3/M, z24.s\n" |
| ".inst 0x658aaef0 // bfcvt z16.h, p3/M, z23.s\n" |
| "st1h { z20.s }, p0, [x26, #2, MUL VL]\n" |
| "inch x26, ALL, MUL #3\n" |
| "st1h { z19.s }, p0, [x25, #2, MUL VL]\n" |
| "inch x25, ALL, MUL #3\n" |
| "st1h { z18.s }, p0, [x24, #2, MUL VL]\n" |
| "inch x24, ALL, MUL #3\n" |
| "st1h { z17.s }, p0, [x23, #2, MUL VL]\n" |
| "inch x23, ALL, MUL #3\n" |
| "st1h { z16.s }, p0, [x22, #2, MUL VL]\n" |
| "inch x22, ALL, MUL #3\n" |
| "bgt 51b\n" |
| "subs %x[rows], %x[rows], #0x8\n" |
| "add %x[out_ptr], %x[out_ptr], x12\n" |
| "bgt 35b\n" |
| "52:" // Exit |
| : [in_ptr] "+&r" (in_ptr), [out_ptr] "+&r" (out_ptr), [rows] "+&r" (rows) |
| : [accumulate] "r" (accumulate), [bias] "r" (bias), [cols] "r" (cols), [ldout] "r" (ldout), [maxval] "r" (maxval), [minval] "r" (minval) |
| : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" |
| ); |
| } |
| |
| #endif // ARM_COMPUTE_ENABLE_SVE |