Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (c) 2018 ARM Limited. |
| 3 | * |
| 4 | * SPDX-License-Identifier: MIT |
| 5 | * |
| 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 7 | * of this software and associated documentation files (the "Software"), to |
| 8 | * deal in the Software without restriction, including without limitation the |
| 9 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
| 10 | * sell copies of the Software, and to permit persons to whom the Software is |
| 11 | * furnished to do so, subject to the following conditions: |
| 12 | * |
| 13 | * The above copyright notice and this permission notice shall be included in all |
| 14 | * copies or substantial portions of the Software. |
| 15 | * |
| 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 22 | * SOFTWARE. |
| 23 | */ |
| 24 | #include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp" |
| 25 | |
| 26 | namespace depthwise |
| 27 | { |
// Aliases for the single tile configuration that this file specialises.
// NOTE(review): the eight template arguments are presumably
//   <output tile rows, output tile cols, kernel rows, kernel cols,
//    stride rows, stride cols, input type, output type>
// -- confirm against impl_fp32_fp32.hpp. That reading is consistent with the
// assembly further down, which loads input registers named U11..U55 and
// weight registers named W11..W33 and stores output registers named V11..V33.
| 28 | using Conv = DepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float>; |
// ConvImpl names the class whose process_tile member template is explicitly
// specialised below; it is instantiated with the same argument list as Conv.
| 29 | using ConvImpl = DepthwiseConvolutionImpl<3, 3, 3, 3, 1, 1, float, float>; |
| 30 | |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame^] | 31 | #ifdef __aarch64__ |
| 32 | |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 33 | template <> |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame^] | 34 | template <> |
| 35 | void ConvImpl::process_tile<true, 0, 0, 0, 0, 0, 0>( |
| 36 | const int n_channels, |
| 37 | const float* const weights, |
| 38 | const int weight_row_stride, |
| 39 | const int weight_col_stride, |
| 40 | const float* const inptr, |
| 41 | const int in_row_stride, |
| 42 | const int in_col_stride, |
| 43 | float* const outptr, |
| 44 | const int out_row_stride, |
| 45 | const int out_col_stride, |
| 46 | const int, const int, const int, const int, const int, const int |
| 47 | ) |
| 48 | { |
| 49 | // Copy pointers |
| 50 | const float *uptr0 = inptr; |
| 51 | const float *wptr0 = weights; |
| 52 | float *vptr0 = outptr; |
| 53 | |
| 54 | int channels_remaining = n_channels; |
| 55 | if (channels_remaining >= 4) |
| 56 | { |
| 57 | // Process blocks of 4 channels at a time |
| 58 | int n_iters = ((channels_remaining / 4) + 1)/2 - 1; |
| 59 | const bool odd_tail = (channels_remaining / 4) & 1; |
| 60 | channels_remaining %= 4; |
| 61 | |
| 62 | asm volatile ( |
| 63 | "qU22B .req q0\n" "qU23B .req q0\n" "qW22A .req q0\n" |
| 64 | "vU22B .req v0\n" "vU23B .req v0\n" "vW22A .req v0\n" |
| 65 | "qV12A .req q1\n" "qW11B .req q1\n" |
| 66 | "vV12A .req v1\n" "vW11B .req v1\n" |
| 67 | "qU41A .req q2\n" "qU32B .req q2\n" "qU33A .req q2\n" "qV13B .req q2\n" |
| 68 | "vU41A .req v2\n" "vU32B .req v2\n" "vU33A .req v2\n" "vV13B .req v2\n" |
| 69 | "qU42B .req q3\n" "qU13B .req q3\n" "qU44B .req q3\n" "qU55A .req q3\n" |
| 70 | "vU42B .req v3\n" "vU13B .req v3\n" "vU44B .req v3\n" "vU55A .req v3\n" |
| 71 | "qU34B .req q4\n" "qU15A .req q4\n" "qU42A .req q4\n" "qU44A .req q4\n" "qU12B .req q4\n" |
| 72 | "vU34B .req v4\n" "vU15A .req v4\n" "vU42A .req v4\n" "vU44A .req v4\n" "vU12B .req v4\n" |
| 73 | "qU33B .req q5\n" "qU52A .req q5\n" "qW23A .req q5\n" |
| 74 | "vU33B .req v5\n" "vU52A .req v5\n" "vW23A .req v5\n" |
| 75 | "qV31A .req q6\n" "qU13A .req q6\n" "qV12B .req q6\n" |
| 76 | "vV31A .req v6\n" "vU13A .req v6\n" "vV12B .req v6\n" |
| 77 | "qU35B .req q7\n" "qU51B .req q7\n" "qV11A .req q7\n" "qU53B .req q7\n" |
| 78 | "vU35B .req v7\n" "vU51B .req v7\n" "vV11A .req v7\n" "vU53B .req v7\n" |
| 79 | "qW21A .req q8\n" "qV22B .req q8\n" |
| 80 | "vW21A .req v8\n" "vV22B .req v8\n" |
| 81 | "qV33B .req q9\n" "qU14A .req q9\n" "qV23A .req q9\n" "qU25B .req q9\n" |
| 82 | "vV33B .req v9\n" "vU14A .req v9\n" "vV23A .req v9\n" "vU25B .req v9\n" |
| 83 | "qW21B .req q10\n" "qV32A .req q10\n" "qU35A .req q10\n" |
| 84 | "vW21B .req v10\n" "vV32A .req v10\n" "vU35A .req v10\n" |
| 85 | "qV11B .req q11\n" "qU15B .req q11\n" "qV33A .req q11\n" |
| 86 | "vV11B .req v11\n" "vU15B .req v11\n" "vV33A .req v11\n" |
| 87 | "qU11B .req q12\n" "qW23B .req q12\n" "qU45A .req q12\n" |
| 88 | "vU11B .req v12\n" "vW23B .req v12\n" "vU45A .req v12\n" |
| 89 | "qW11A .req q13\n" "qU45B .req q13\n" "qU52B .req q13\n" |
| 90 | "vW11A .req v13\n" "vU45B .req v13\n" "vU52B .req v13\n" |
| 91 | "qU55B .req q14\n" "qU25A .req q14\n" "qV21A .req q14\n" |
| 92 | "vU55B .req v14\n" "vU25A .req v14\n" "vV21A .req v14\n" |
| 93 | "qU53A .req q15\n" "qV21B .req q15\n" "qU31A .req q15\n" |
| 94 | "vU53A .req v15\n" "vV21B .req v15\n" "vU31A .req v15\n" |
| 95 | "qW13B .req q16\n" "qU23A .req q16\n" |
| 96 | "vW13B .req v16\n" "vU23A .req v16\n" |
| 97 | "qW33B .req q17\n" "qW33A .req q17\n" |
| 98 | "vW33B .req v17\n" "vW33A .req v17\n" |
| 99 | "qU24B .req q18\n" "qU32A .req q18\n" "qV31B .req q18\n" "qV13A .req q18\n" |
| 100 | "vU24B .req v18\n" "vU32A .req v18\n" "vV31B .req v18\n" "vV13A .req v18\n" |
| 101 | "qU31B .req q19\n" "qU11A .req q19\n" "qU54B .req q19\n" "qU43A .req q19\n" |
| 102 | "vU31B .req v19\n" "vU11A .req v19\n" "vU54B .req v19\n" "vU43A .req v19\n" |
| 103 | "qU24A .req q20\n" "qW12B .req q20\n" "qU54A .req q20\n" |
| 104 | "vU24A .req v20\n" "vW12B .req v20\n" "vU54A .req v20\n" |
| 105 | "qV23B .req q21\n" "qW12A .req q21\n" |
| 106 | "vV23B .req v21\n" "vW12A .req v21\n" |
| 107 | "qW32A .req q22\n" "qU43B .req q22\n" |
| 108 | "vW32A .req v22\n" "vU43B .req v22\n" |
| 109 | "qW31A .req q23\n" "qV32B .req q23\n" |
| 110 | "vW31A .req v23\n" "vV32B .req v23\n" |
| 111 | "qU22A .req q24\n" "qW31B .req q24\n" |
| 112 | "vU22A .req v24\n" "vW31B .req v24\n" |
| 113 | "qU21B .req q25\n" "qV22A .req q25\n" |
| 114 | "vU21B .req v25\n" "vV22A .req v25\n" |
| 115 | "qU34A .req q26\n" "qW22B .req q26\n" "qU12A .req q26\n" |
| 116 | "vU34A .req v26\n" "vW22B .req v26\n" "vU12A .req v26\n" |
| 117 | "qW13A .req q27\n" "qU51A .req q27\n" |
| 118 | "vW13A .req v27\n" "vU51A .req v27\n" |
| 119 | "qW32B .req q28\n" |
| 120 | "vW32B .req v28\n" |
| 121 | "qU41B .req q29\n" "qU14B .req q29\n" |
| 122 | "vU41B .req v29\n" "vU14B .req v29\n" |
| 123 | "qU21A .req q30\n" |
| 124 | "vU21A .req v30\n" |
| 125 | |
| 126 | "uptr1 .req x0\n" |
| 127 | "uptr2 .req x1\n" |
| 128 | "uptr3 .req x2\n" |
| 129 | "uptr4 .req x3\n" |
| 130 | |
| 131 | "u_col_stride1 .req %x[u_col_stride]\n" |
| 132 | "u_col_stride2 .req x4\n" |
| 133 | "u_col_stride3 .req x5\n" |
| 134 | "u_col_stride4 .req x6\n" |
| 135 | |
| 136 | "wptr1 .req x7\n" |
| 137 | "wptr2 .req x8\n" |
| 138 | "w_col_stride1 .req %x[w_col_stride]\n" |
| 139 | "w_col_stride2 .req x9\n" |
| 140 | |
| 141 | "vptr1 .req x10\n" |
| 142 | "vptr2 .req x11\n" |
| 143 | "v_col_stride1 .req %x[v_col_stride]\n" |
| 144 | "v_col_stride2 .req x12\n" |
| 145 | |
| 146 | // Prepare strides and pointers |
| 147 | "add uptr1, %x[uptr0], %x[u_row_stride]\n" |
| 148 | "add uptr2, uptr1 , %x[u_row_stride]\n" |
| 149 | "add uptr3, uptr2 , %x[u_row_stride]\n" |
| 150 | "add uptr4, uptr3 , %x[u_row_stride]\n" |
| 151 | "add u_col_stride2, u_col_stride1, u_col_stride1\n" |
| 152 | "add u_col_stride3, u_col_stride2, u_col_stride1\n" |
| 153 | "add u_col_stride4, u_col_stride3, u_col_stride1\n" |
| 154 | |
| 155 | "add wptr1, %x[wptr0], %x[w_row_stride]\n" |
| 156 | "add wptr2, wptr1 , %x[w_row_stride]\n" |
| 157 | "add w_col_stride2, w_col_stride1, w_col_stride1\n" |
| 158 | |
| 159 | "add vptr1, %x[vptr0], %x[v_row_stride]\n" |
| 160 | "add vptr2, vptr1 , %x[v_row_stride]\n" |
| 161 | "add v_col_stride2, v_col_stride1, v_col_stride1\n" |
| 162 | |
| 163 | // Pre-load for A |
| 164 | "ldr qW13A, [%x[wptr0], w_col_stride2]\n" |
| 165 | "ldr qW23A, [wptr1, w_col_stride2]\n" |
| 166 | "ldr qW33A, [wptr2, w_col_stride2]\n" |
| 167 | "ldr qW12A, [%x[wptr0], w_col_stride1]\n" |
| 168 | "ldr qU15A, [%x[uptr0], u_col_stride4]\n" |
| 169 | "ldr qW22A, [wptr1, w_col_stride1]\n" |
| 170 | "ldr qU14A, [%x[uptr0], u_col_stride3]\n" |
| 171 | "ldr qW32A, [wptr2, w_col_stride1]\n" |
| 172 | "ldr qU13A, [%x[uptr0], u_col_stride2]\n" |
| 173 | "ldr qU25A, [uptr1, u_col_stride4]\n" |
| 174 | "ldr qU24A, [uptr1, u_col_stride3]\n" |
| 175 | "ldr qW11A, [%x[wptr0]], #0x10\n" |
| 176 | "ldr qU23A, [uptr1, u_col_stride2]\n" |
| 177 | "ldr qW21A, [wptr1], #0x10\n" |
| 178 | "ldr qW31A, [wptr2], #0x10\n" |
| 179 | "ldr qU34A, [uptr2, u_col_stride3]\n" |
| 180 | "ldr qU35A, [uptr2, u_col_stride4]\n" |
| 181 | |
| 182 | // First part of A |
| 183 | "fmul vV13A.4s, vU15A.4s, vW13A.4s\n" |
| 184 | "ldr qU33A, [uptr2, u_col_stride2]\n" |
| 185 | "fmul vV12A.4s, vU14A.4s, vW13A.4s\n" |
| 186 | "cbz %x[n_iters], 2f\n" // Jump to tail if not looping |
| 187 | |
| 188 | "1:" // Main loop, double unrolled |
| 189 | // A Part |
| 190 | "fmla vV13A.4s, vU14A.4s, vW12A.4s\n" |
| 191 | "ldr qU45A, [uptr3, u_col_stride4]\n" |
| 192 | "fmul vV11A.4s, vU13A.4s, vW13A.4s\n" |
| 193 | "fmla vV12A.4s, vU13A.4s, vW12A.4s\n" |
| 194 | "fmla vV13A.4s, vU13A.4s, vW11A.4s\n" |
| 195 | "ldr qU44A, [uptr3, u_col_stride3]\n" |
| 196 | "fmla vV13A.4s, vU25A.4s, vW23A.4s\n" |
| 197 | "fmul vV23A.4s, vU25A.4s, vW13A.4s\n" |
| 198 | "ldr qU43A, [uptr3, u_col_stride2]\n" |
| 199 | "fmla vV12A.4s, vU24A.4s, vW23A.4s\n" |
| 200 | "fmla vV13A.4s, vU24A.4s, vW22A.4s\n" |
| 201 | "fmul vV22A.4s, vU24A.4s, vW13A.4s\n" |
| 202 | "fmla vV23A.4s, vU24A.4s, vW12A.4s\n" |
| 203 | "ldr qU55A, [uptr4, u_col_stride4]\n" |
| 204 | "fmla vV11A.4s, vU23A.4s, vW23A.4s\n" |
| 205 | "fmla vV12A.4s, vU23A.4s, vW22A.4s\n" |
| 206 | "fmla vV13A.4s, vU23A.4s, vW21A.4s\n" |
| 207 | "fmul vV21A.4s, vU23A.4s, vW13A.4s\n" |
| 208 | "fmla vV22A.4s, vU23A.4s, vW12A.4s\n" |
| 209 | "fmla vV23A.4s, vU23A.4s, vW11A.4s\n" |
| 210 | "ldr qU54A, [uptr4, u_col_stride3]\n" |
| 211 | "fmla vV13A.4s, vU35A.4s, vW33A.4s\n" |
| 212 | "fmla vV23A.4s, vU35A.4s, vW23A.4s\n" |
| 213 | "fmul vV33A.4s, vU35A.4s, vW13A.4s\n" |
| 214 | "ldr qU53A, [uptr4, u_col_stride2]\n" |
| 215 | "fmla vV12A.4s, vU34A.4s, vW33A.4s\n" |
| 216 | "fmla vV13A.4s, vU34A.4s, vW32A.4s\n" |
| 217 | "fmla vV22A.4s, vU34A.4s, vW23A.4s\n" |
| 218 | "fmla vV23A.4s, vU34A.4s, vW22A.4s\n" |
| 219 | "fmul vV32A.4s, vU34A.4s, vW13A.4s\n" |
| 220 | "fmla vV33A.4s, vU34A.4s, vW12A.4s\n" |
| 221 | "ldr qU12A, [%x[uptr0], u_col_stride1]\n" |
| 222 | "fmla vV11A.4s, vU33A.4s, vW33A.4s\n" |
| 223 | "fmla vV12A.4s, vU33A.4s, vW32A.4s\n" |
| 224 | "fmla vV13A.4s, vU33A.4s, vW31A.4s\n" |
| 225 | "str qV13A, [%x[vptr0], v_col_stride2]\n" |
| 226 | "fmla vV21A.4s, vU33A.4s, vW23A.4s\n" |
| 227 | "fmla vV22A.4s, vU33A.4s, vW22A.4s\n" |
| 228 | "fmla vV23A.4s, vU33A.4s, vW21A.4s\n" |
| 229 | "fmul vV31A.4s, vU33A.4s, vW13A.4s\n" |
| 230 | "ldr qW13B, [%x[wptr0], w_col_stride2]\n" |
| 231 | "fmla vV32A.4s, vU33A.4s, vW12A.4s\n" |
| 232 | "fmla vV33A.4s, vU33A.4s, vW11A.4s\n" |
| 233 | "ldr qU22A, [uptr1, u_col_stride1]\n" |
| 234 | "fmla vV23A.4s, vU45A.4s, vW33A.4s\n" |
| 235 | "fmla vV33A.4s, vU45A.4s, vW23A.4s\n" |
| 236 | "ldr qU32A, [uptr2, u_col_stride1]\n" |
| 237 | "fmla vV22A.4s, vU44A.4s, vW33A.4s\n" |
| 238 | "fmla vV23A.4s, vU44A.4s, vW32A.4s\n" |
| 239 | "fmla vV32A.4s, vU44A.4s, vW23A.4s\n" |
| 240 | "fmla vV33A.4s, vU44A.4s, vW22A.4s\n" |
| 241 | "ldr qU42A, [uptr3, u_col_stride1]\n" |
| 242 | "fmla vV21A.4s, vU43A.4s, vW33A.4s\n" |
| 243 | "fmla vV22A.4s, vU43A.4s, vW32A.4s\n" |
| 244 | "fmla vV23A.4s, vU43A.4s, vW31A.4s\n" |
| 245 | "str qV23A, [vptr1, v_col_stride2]\n" |
| 246 | "fmla vV31A.4s, vU43A.4s, vW23A.4s\n" |
| 247 | "ldr qW23B, [wptr1, w_col_stride2]\n" |
| 248 | "fmla vV32A.4s, vU43A.4s, vW22A.4s\n" |
| 249 | "fmla vV33A.4s, vU43A.4s, vW21A.4s\n" |
| 250 | "ldr qU52A, [uptr4, u_col_stride1]\n" |
| 251 | "fmla vV33A.4s, vU55A.4s, vW33A.4s\n" |
| 252 | "ldr qU11A, [%x[uptr0]], #0x10\n" |
| 253 | "fmla vV32A.4s, vU54A.4s, vW33A.4s\n" |
| 254 | "fmla vV33A.4s, vU54A.4s, vW32A.4s\n" |
| 255 | "ldr qU21A, [uptr1], #0x10\n" |
| 256 | "fmla vV31A.4s, vU53A.4s, vW33A.4s\n" |
| 257 | "ldr qW33B, [wptr2, w_col_stride2]\n" |
| 258 | "fmla vV32A.4s, vU53A.4s, vW32A.4s\n" |
| 259 | "fmla vV33A.4s, vU53A.4s, vW31A.4s\n" |
| 260 | "str qV33A, [vptr2, v_col_stride2]\n" |
| 261 | "fmla vV11A.4s, vU12A.4s, vW12A.4s\n" |
| 262 | "ldr qU31A, [uptr2], #0x10\n" |
| 263 | "fmla vV12A.4s, vU12A.4s, vW11A.4s\n" |
| 264 | "ldr qU41A, [uptr3], #0x10\n" |
| 265 | "fmla vV11A.4s, vU22A.4s, vW22A.4s\n" |
| 266 | "ldr qU51A, [uptr4], #0x10\n" |
| 267 | "fmla vV12A.4s, vU22A.4s, vW21A.4s\n" |
| 268 | "ldr qW12B, [%x[wptr0], w_col_stride1]\n" |
| 269 | "fmla vV21A.4s, vU22A.4s, vW12A.4s\n" |
| 270 | "ldr qU15B, [%x[uptr0], u_col_stride4]\n" |
| 271 | "fmla vV22A.4s, vU22A.4s, vW11A.4s\n" |
| 272 | "ldr qW22B, [wptr1, w_col_stride1]\n" |
| 273 | "fmla vV11A.4s, vU32A.4s, vW32A.4s\n" |
| 274 | "ldr qU14B, [%x[uptr0], u_col_stride3]\n" |
| 275 | "fmla vV12A.4s, vU32A.4s, vW31A.4s\n" |
| 276 | "str qV12A, [%x[vptr0], v_col_stride1]\n" |
| 277 | "fmla vV21A.4s, vU32A.4s, vW22A.4s\n" |
| 278 | "ldr qW32B, [wptr2, w_col_stride1]\n" |
| 279 | "fmla vV22A.4s, vU32A.4s, vW21A.4s\n" |
| 280 | "ldr qU13B, [%x[uptr0], u_col_stride2]\n" |
| 281 | "fmla vV31A.4s, vU32A.4s, vW12A.4s\n" |
| 282 | "ldr qU25B, [uptr1, u_col_stride4]\n" |
| 283 | "fmla vV32A.4s, vU32A.4s, vW11A.4s\n" |
| 284 | "ldr qU24B, [uptr1, u_col_stride3]\n" |
| 285 | "fmla vV21A.4s, vU42A.4s, vW32A.4s\n" |
| 286 | "fmla vV22A.4s, vU42A.4s, vW31A.4s\n" |
| 287 | "str qV22A, [vptr1, v_col_stride1]\n" |
| 288 | "fmla vV31A.4s, vU42A.4s, vW22A.4s\n" |
| 289 | "fmla vV32A.4s, vU42A.4s, vW21A.4s\n" |
| 290 | "fmla vV31A.4s, vU52A.4s, vW32A.4s\n" |
| 291 | "fmla vV32A.4s, vU52A.4s, vW31A.4s\n" |
| 292 | "str qV32A, [vptr2, v_col_stride1]\n" |
| 293 | "fmla vV11A.4s, vU11A.4s, vW11A.4s\n" |
| 294 | "ldr qW11B, [%x[wptr0]], #0x10\n" |
| 295 | "fmla vV11A.4s, vU21A.4s, vW21A.4s\n" |
| 296 | "ldr qU23B, [uptr1, u_col_stride2]\n" |
| 297 | "fmla vV21A.4s, vU21A.4s, vW11A.4s\n" |
| 298 | "ldr qW21B, [wptr1], #0x10\n" |
| 299 | "fmla vV11A.4s, vU31A.4s, vW31A.4s\n" |
| 300 | "str qV11A, [%x[vptr0]], #0x10\n" |
| 301 | "fmla vV21A.4s, vU31A.4s, vW21A.4s\n" |
| 302 | "ldr qW31B, [wptr2], #0x10\n" |
| 303 | "fmla vV31A.4s, vU31A.4s, vW11A.4s\n" |
| 304 | "ldr qU34B, [uptr2, u_col_stride3]\n" |
| 305 | "fmla vV21A.4s, vU41A.4s, vW31A.4s\n" |
| 306 | "str qV21A, [vptr1], #0x10\n" |
| 307 | "fmla vV31A.4s, vU41A.4s, vW21A.4s\n" |
| 308 | "ldr qU35B, [uptr2, u_col_stride4]\n" |
| 309 | "fmla vV31A.4s, vU51A.4s, vW31A.4s\n" |
| 310 | "str qV31A, [vptr2], #0x10\n" |
| 311 | |
| 312 | // B Part |
| 313 | "fmul vV13B.4s, vU15B.4s, vW13B.4s\n" |
| 314 | "ldr qU33B, [uptr2, u_col_stride2]\n" |
| 315 | "fmul vV12B.4s, vU14B.4s, vW13B.4s\n" |
| 316 | "fmla vV13B.4s, vU14B.4s, vW12B.4s\n" |
| 317 | "ldr qU45B, [uptr3, u_col_stride4]\n" |
| 318 | "fmul vV11B.4s, vU13B.4s, vW13B.4s\n" |
| 319 | "fmla vV12B.4s, vU13B.4s, vW12B.4s\n" |
| 320 | "fmla vV13B.4s, vU13B.4s, vW11B.4s\n" |
| 321 | "ldr qU44B, [uptr3, u_col_stride3]\n" |
| 322 | "fmla vV13B.4s, vU25B.4s, vW23B.4s\n" |
| 323 | "fmul vV23B.4s, vU25B.4s, vW13B.4s\n" |
| 324 | "ldr qU43B, [uptr3, u_col_stride2]\n" |
| 325 | "fmla vV12B.4s, vU24B.4s, vW23B.4s\n" |
| 326 | "fmla vV13B.4s, vU24B.4s, vW22B.4s\n" |
| 327 | "fmul vV22B.4s, vU24B.4s, vW13B.4s\n" |
| 328 | "fmla vV23B.4s, vU24B.4s, vW12B.4s\n" |
| 329 | "ldr qU55B, [uptr4, u_col_stride4]\n" |
| 330 | "fmla vV11B.4s, vU23B.4s, vW23B.4s\n" |
| 331 | "fmla vV12B.4s, vU23B.4s, vW22B.4s\n" |
| 332 | "fmla vV13B.4s, vU23B.4s, vW21B.4s\n" |
| 333 | "fmul vV21B.4s, vU23B.4s, vW13B.4s\n" |
| 334 | "fmla vV22B.4s, vU23B.4s, vW12B.4s\n" |
| 335 | "fmla vV23B.4s, vU23B.4s, vW11B.4s\n" |
| 336 | "ldr qU54B, [uptr4, u_col_stride3]\n" |
| 337 | "fmla vV13B.4s, vU35B.4s, vW33B.4s\n" |
| 338 | "fmla vV23B.4s, vU35B.4s, vW23B.4s\n" |
| 339 | "fmul vV33B.4s, vU35B.4s, vW13B.4s\n" |
| 340 | "ldr qU53B, [uptr4, u_col_stride2]\n" |
| 341 | "fmla vV12B.4s, vU34B.4s, vW33B.4s\n" |
| 342 | "fmla vV13B.4s, vU34B.4s, vW32B.4s\n" |
| 343 | "fmla vV22B.4s, vU34B.4s, vW23B.4s\n" |
| 344 | "fmla vV23B.4s, vU34B.4s, vW22B.4s\n" |
| 345 | "fmul vV32B.4s, vU34B.4s, vW13B.4s\n" |
| 346 | "fmla vV33B.4s, vU34B.4s, vW12B.4s\n" |
| 347 | "ldr qU12B, [%x[uptr0], u_col_stride1]\n" |
| 348 | "fmla vV11B.4s, vU33B.4s, vW33B.4s\n" |
| 349 | "fmla vV12B.4s, vU33B.4s, vW32B.4s\n" |
| 350 | "fmla vV13B.4s, vU33B.4s, vW31B.4s\n" |
| 351 | "str qV13B, [%x[vptr0], v_col_stride2]\n" |
| 352 | "fmla vV21B.4s, vU33B.4s, vW23B.4s\n" |
| 353 | "fmla vV22B.4s, vU33B.4s, vW22B.4s\n" |
| 354 | "fmla vV23B.4s, vU33B.4s, vW21B.4s\n" |
| 355 | "fmul vV31B.4s, vU33B.4s, vW13B.4s\n" |
| 356 | "ldr qW13A, [%x[wptr0], w_col_stride2]\n" |
| 357 | "fmla vV32B.4s, vU33B.4s, vW12B.4s\n" |
| 358 | "fmla vV33B.4s, vU33B.4s, vW11B.4s\n" |
| 359 | "ldr qU22B, [uptr1, u_col_stride1]\n" |
| 360 | "fmla vV23B.4s, vU45B.4s, vW33B.4s\n" |
| 361 | "fmla vV33B.4s, vU45B.4s, vW23B.4s\n" |
| 362 | "ldr qU32B, [uptr2, u_col_stride1]\n" |
| 363 | "fmla vV22B.4s, vU44B.4s, vW33B.4s\n" |
| 364 | "fmla vV23B.4s, vU44B.4s, vW32B.4s\n" |
| 365 | "fmla vV32B.4s, vU44B.4s, vW23B.4s\n" |
| 366 | "fmla vV33B.4s, vU44B.4s, vW22B.4s\n" |
| 367 | "ldr qU42B, [uptr3, u_col_stride1]\n" |
| 368 | "fmla vV21B.4s, vU43B.4s, vW33B.4s\n" |
| 369 | "fmla vV22B.4s, vU43B.4s, vW32B.4s\n" |
| 370 | "fmla vV23B.4s, vU43B.4s, vW31B.4s\n" |
| 371 | "str qV23B, [vptr1, v_col_stride2]\n" |
| 372 | "fmla vV31B.4s, vU43B.4s, vW23B.4s\n" |
| 373 | "ldr qW23A, [wptr1, w_col_stride2]\n" |
| 374 | "fmla vV32B.4s, vU43B.4s, vW22B.4s\n" |
| 375 | "fmla vV33B.4s, vU43B.4s, vW21B.4s\n" |
| 376 | "ldr qU52B, [uptr4, u_col_stride1]\n" |
| 377 | "fmla vV33B.4s, vU55B.4s, vW33B.4s\n" |
| 378 | "ldr qU11B, [%x[uptr0]], #0x10\n" |
| 379 | "fmla vV32B.4s, vU54B.4s, vW33B.4s\n" |
| 380 | "fmla vV33B.4s, vU54B.4s, vW32B.4s\n" |
| 381 | "ldr qU21B, [uptr1], #0x10\n" |
| 382 | "fmla vV31B.4s, vU53B.4s, vW33B.4s\n" |
| 383 | "ldr qW33A, [wptr2, w_col_stride2]\n" |
| 384 | "fmla vV32B.4s, vU53B.4s, vW32B.4s\n" |
| 385 | "fmla vV33B.4s, vU53B.4s, vW31B.4s\n" |
| 386 | "str qV33B, [vptr2, v_col_stride2]\n" |
| 387 | "fmla vV11B.4s, vU12B.4s, vW12B.4s\n" |
| 388 | "ldr qU31B, [uptr2], #0x10\n" |
| 389 | "fmla vV12B.4s, vU12B.4s, vW11B.4s\n" |
| 390 | "ldr qU41B, [uptr3], #0x10\n" |
| 391 | "fmla vV11B.4s, vU22B.4s, vW22B.4s\n" |
| 392 | "ldr qU51B, [uptr4], #0x10\n" |
| 393 | "fmla vV12B.4s, vU22B.4s, vW21B.4s\n" |
| 394 | "ldr qW12A, [%x[wptr0], w_col_stride1]\n" |
| 395 | "fmla vV21B.4s, vU22B.4s, vW12B.4s\n" |
| 396 | "ldr qU15A, [%x[uptr0], u_col_stride4]\n" |
| 397 | "fmla vV22B.4s, vU22B.4s, vW11B.4s\n" |
| 398 | "ldr qW22A, [wptr1, w_col_stride1]\n" |
| 399 | "fmla vV11B.4s, vU32B.4s, vW32B.4s\n" |
| 400 | "ldr qU14A, [%x[uptr0], u_col_stride3]\n" |
| 401 | "fmla vV12B.4s, vU32B.4s, vW31B.4s\n" |
| 402 | "str qV12B, [%x[vptr0], v_col_stride1]\n" |
| 403 | "fmla vV21B.4s, vU32B.4s, vW22B.4s\n" |
| 404 | "ldr qW32A, [wptr2, w_col_stride1]\n" |
| 405 | "fmla vV22B.4s, vU32B.4s, vW21B.4s\n" |
| 406 | "ldr qU13A, [%x[uptr0], u_col_stride2]\n" |
| 407 | "fmla vV31B.4s, vU32B.4s, vW12B.4s\n" |
| 408 | "ldr qU25A, [uptr1, u_col_stride4]\n" |
| 409 | "fmla vV32B.4s, vU32B.4s, vW11B.4s\n" |
| 410 | "ldr qU24A, [uptr1, u_col_stride3]\n" |
| 411 | "fmla vV21B.4s, vU42B.4s, vW32B.4s\n" |
| 412 | "fmla vV22B.4s, vU42B.4s, vW31B.4s\n" |
| 413 | "str qV22B, [vptr1, v_col_stride1]\n" |
| 414 | "fmla vV31B.4s, vU42B.4s, vW22B.4s\n" |
| 415 | "fmla vV32B.4s, vU42B.4s, vW21B.4s\n" |
| 416 | "fmla vV31B.4s, vU52B.4s, vW32B.4s\n" |
| 417 | "subs %x[n_iters], %x[n_iters], #1\n" |
| 418 | "fmla vV32B.4s, vU52B.4s, vW31B.4s\n" |
| 419 | "str qV32B, [vptr2, v_col_stride1]\n" |
| 420 | "fmla vV11B.4s, vU11B.4s, vW11B.4s\n" |
| 421 | "ldr qW11A, [%x[wptr0]], #0x10\n" |
| 422 | "fmla vV11B.4s, vU21B.4s, vW21B.4s\n" |
| 423 | "ldr qU23A, [uptr1, u_col_stride2]\n" |
| 424 | "fmla vV21B.4s, vU21B.4s, vW11B.4s\n" |
| 425 | "ldr qW21A, [wptr1], #0x10\n" |
| 426 | "fmla vV11B.4s, vU31B.4s, vW31B.4s\n" |
| 427 | "str qV11B, [%x[vptr0]], #0x10\n" |
| 428 | "fmla vV21B.4s, vU31B.4s, vW21B.4s\n" |
| 429 | "ldr qW31A, [wptr2], #0x10\n" |
| 430 | "fmla vV31B.4s, vU31B.4s, vW11B.4s\n" |
| 431 | "ldr qU34A, [uptr2, u_col_stride3]\n" |
| 432 | "fmla vV21B.4s, vU41B.4s, vW31B.4s\n" |
| 433 | "str qV21B, [vptr1], #0x10\n" |
| 434 | "fmla vV31B.4s, vU41B.4s, vW21B.4s\n" |
| 435 | "ldr qU35A, [uptr2, u_col_stride4]\n" |
| 436 | "fmla vV31B.4s, vU51B.4s, vW31B.4s\n" |
| 437 | "str qV31B, [vptr2], #0x10\n" |
| 438 | |
| 439 | // First part of A |
| 440 | "fmul vV13A.4s, vU15A.4s, vW13A.4s\n" |
| 441 | "ldr qU33A, [uptr2, u_col_stride2]\n" |
| 442 | "fmul vV12A.4s, vU14A.4s, vW13A.4s\n" |
| 443 | "bne 1b\n" // Loop |
| 444 | |
| 445 | "2:" // Tail dispatch |
| 446 | "cbnz %w[odd_tail], 3f\n" |
| 447 | |
| 448 | // Even tail |
| 449 | // A Part |
| 450 | "fmla vV13A.4s, vU14A.4s, vW12A.4s\n" |
| 451 | "ldr qU45A, [uptr3, u_col_stride4]\n" |
| 452 | "fmul vV11A.4s, vU13A.4s, vW13A.4s\n" |
| 453 | "fmla vV12A.4s, vU13A.4s, vW12A.4s\n" |
| 454 | "fmla vV13A.4s, vU13A.4s, vW11A.4s\n" |
| 455 | "ldr qU44A, [uptr3, u_col_stride3]\n" |
| 456 | "fmla vV13A.4s, vU25A.4s, vW23A.4s\n" |
| 457 | "fmul vV23A.4s, vU25A.4s, vW13A.4s\n" |
| 458 | "ldr qU43A, [uptr3, u_col_stride2]\n" |
| 459 | "fmla vV12A.4s, vU24A.4s, vW23A.4s\n" |
| 460 | "fmla vV13A.4s, vU24A.4s, vW22A.4s\n" |
| 461 | "fmul vV22A.4s, vU24A.4s, vW13A.4s\n" |
| 462 | "fmla vV23A.4s, vU24A.4s, vW12A.4s\n" |
| 463 | "ldr qU55A, [uptr4, u_col_stride4]\n" |
| 464 | "fmla vV11A.4s, vU23A.4s, vW23A.4s\n" |
| 465 | "fmla vV12A.4s, vU23A.4s, vW22A.4s\n" |
| 466 | "fmla vV13A.4s, vU23A.4s, vW21A.4s\n" |
| 467 | "fmul vV21A.4s, vU23A.4s, vW13A.4s\n" |
| 468 | "fmla vV22A.4s, vU23A.4s, vW12A.4s\n" |
| 469 | "fmla vV23A.4s, vU23A.4s, vW11A.4s\n" |
| 470 | "ldr qU54A, [uptr4, u_col_stride3]\n" |
| 471 | "fmla vV13A.4s, vU35A.4s, vW33A.4s\n" |
| 472 | "fmla vV23A.4s, vU35A.4s, vW23A.4s\n" |
| 473 | "fmul vV33A.4s, vU35A.4s, vW13A.4s\n" |
| 474 | "ldr qU53A, [uptr4, u_col_stride2]\n" |
| 475 | "fmla vV12A.4s, vU34A.4s, vW33A.4s\n" |
| 476 | "fmla vV13A.4s, vU34A.4s, vW32A.4s\n" |
| 477 | "fmla vV22A.4s, vU34A.4s, vW23A.4s\n" |
| 478 | "fmla vV23A.4s, vU34A.4s, vW22A.4s\n" |
| 479 | "fmul vV32A.4s, vU34A.4s, vW13A.4s\n" |
| 480 | "fmla vV33A.4s, vU34A.4s, vW12A.4s\n" |
| 481 | "ldr qU12A, [%x[uptr0], u_col_stride1]\n" |
| 482 | "fmla vV11A.4s, vU33A.4s, vW33A.4s\n" |
| 483 | "fmla vV12A.4s, vU33A.4s, vW32A.4s\n" |
| 484 | "fmla vV13A.4s, vU33A.4s, vW31A.4s\n" |
| 485 | "str qV13A, [%x[vptr0], v_col_stride2]\n" |
| 486 | "fmla vV21A.4s, vU33A.4s, vW23A.4s\n" |
| 487 | "fmla vV22A.4s, vU33A.4s, vW22A.4s\n" |
| 488 | "fmla vV23A.4s, vU33A.4s, vW21A.4s\n" |
| 489 | "fmul vV31A.4s, vU33A.4s, vW13A.4s\n" |
| 490 | "ldr qW13B, [%x[wptr0], w_col_stride2]\n" |
| 491 | "fmla vV32A.4s, vU33A.4s, vW12A.4s\n" |
| 492 | "fmla vV33A.4s, vU33A.4s, vW11A.4s\n" |
| 493 | "ldr qU22A, [uptr1, u_col_stride1]\n" |
| 494 | "fmla vV23A.4s, vU45A.4s, vW33A.4s\n" |
| 495 | "fmla vV33A.4s, vU45A.4s, vW23A.4s\n" |
| 496 | "ldr qU32A, [uptr2, u_col_stride1]\n" |
| 497 | "fmla vV22A.4s, vU44A.4s, vW33A.4s\n" |
| 498 | "fmla vV23A.4s, vU44A.4s, vW32A.4s\n" |
| 499 | "fmla vV32A.4s, vU44A.4s, vW23A.4s\n" |
| 500 | "fmla vV33A.4s, vU44A.4s, vW22A.4s\n" |
| 501 | "ldr qU42A, [uptr3, u_col_stride1]\n" |
| 502 | "fmla vV21A.4s, vU43A.4s, vW33A.4s\n" |
| 503 | "fmla vV22A.4s, vU43A.4s, vW32A.4s\n" |
| 504 | "fmla vV23A.4s, vU43A.4s, vW31A.4s\n" |
| 505 | "str qV23A, [vptr1, v_col_stride2]\n" |
| 506 | "fmla vV31A.4s, vU43A.4s, vW23A.4s\n" |
| 507 | "ldr qW23B, [wptr1, w_col_stride2]\n" |
| 508 | "fmla vV32A.4s, vU43A.4s, vW22A.4s\n" |
| 509 | "fmla vV33A.4s, vU43A.4s, vW21A.4s\n" |
| 510 | "ldr qU52A, [uptr4, u_col_stride1]\n" |
| 511 | "fmla vV33A.4s, vU55A.4s, vW33A.4s\n" |
| 512 | "ldr qU11A, [%x[uptr0]], #0x10\n" |
| 513 | "fmla vV32A.4s, vU54A.4s, vW33A.4s\n" |
| 514 | "fmla vV33A.4s, vU54A.4s, vW32A.4s\n" |
| 515 | "ldr qU21A, [uptr1], #0x10\n" |
| 516 | "fmla vV31A.4s, vU53A.4s, vW33A.4s\n" |
| 517 | "ldr qW33B, [wptr2, w_col_stride2]\n" |
| 518 | "fmla vV32A.4s, vU53A.4s, vW32A.4s\n" |
| 519 | "fmla vV33A.4s, vU53A.4s, vW31A.4s\n" |
| 520 | "str qV33A, [vptr2, v_col_stride2]\n" |
| 521 | "fmla vV11A.4s, vU12A.4s, vW12A.4s\n" |
| 522 | "ldr qU31A, [uptr2], #0x10\n" |
| 523 | "fmla vV12A.4s, vU12A.4s, vW11A.4s\n" |
| 524 | "ldr qU41A, [uptr3], #0x10\n" |
| 525 | "fmla vV11A.4s, vU22A.4s, vW22A.4s\n" |
| 526 | "ldr qU51A, [uptr4], #0x10\n" |
| 527 | "fmla vV12A.4s, vU22A.4s, vW21A.4s\n" |
| 528 | "ldr qW12B, [%x[wptr0], w_col_stride1]\n" |
| 529 | "fmla vV21A.4s, vU22A.4s, vW12A.4s\n" |
| 530 | "ldr qU15B, [%x[uptr0], u_col_stride4]\n" |
| 531 | "fmla vV22A.4s, vU22A.4s, vW11A.4s\n" |
| 532 | "ldr qW22B, [wptr1, w_col_stride1]\n" |
| 533 | "fmla vV11A.4s, vU32A.4s, vW32A.4s\n" |
| 534 | "ldr qU14B, [%x[uptr0], u_col_stride3]\n" |
| 535 | "fmla vV12A.4s, vU32A.4s, vW31A.4s\n" |
| 536 | "str qV12A, [%x[vptr0], v_col_stride1]\n" |
| 537 | "fmla vV21A.4s, vU32A.4s, vW22A.4s\n" |
| 538 | "ldr qW32B, [wptr2, w_col_stride1]\n" |
| 539 | "fmla vV22A.4s, vU32A.4s, vW21A.4s\n" |
| 540 | "ldr qU13B, [%x[uptr0], u_col_stride2]\n" |
| 541 | "fmla vV31A.4s, vU32A.4s, vW12A.4s\n" |
| 542 | "ldr qU25B, [uptr1, u_col_stride4]\n" |
| 543 | "fmla vV32A.4s, vU32A.4s, vW11A.4s\n" |
| 544 | "ldr qU24B, [uptr1, u_col_stride3]\n" |
| 545 | "fmla vV21A.4s, vU42A.4s, vW32A.4s\n" |
| 546 | "fmla vV22A.4s, vU42A.4s, vW31A.4s\n" |
| 547 | "str qV22A, [vptr1, v_col_stride1]\n" |
| 548 | "fmla vV31A.4s, vU42A.4s, vW22A.4s\n" |
| 549 | "fmla vV32A.4s, vU42A.4s, vW21A.4s\n" |
| 550 | "fmla vV31A.4s, vU52A.4s, vW32A.4s\n" |
| 551 | "fmla vV32A.4s, vU52A.4s, vW31A.4s\n" |
| 552 | "str qV32A, [vptr2, v_col_stride1]\n" |
| 553 | "fmla vV11A.4s, vU11A.4s, vW11A.4s\n" |
| 554 | "ldr qW11B, [%x[wptr0]], #0x10\n" |
| 555 | "fmla vV11A.4s, vU21A.4s, vW21A.4s\n" |
| 556 | "ldr qU23B, [uptr1, u_col_stride2]\n" |
| 557 | "fmla vV21A.4s, vU21A.4s, vW11A.4s\n" |
| 558 | "ldr qW21B, [wptr1], #0x10\n" |
| 559 | "fmla vV11A.4s, vU31A.4s, vW31A.4s\n" |
| 560 | "str qV11A, [%x[vptr0]], #0x10\n" |
| 561 | "fmla vV21A.4s, vU31A.4s, vW21A.4s\n" |
| 562 | "ldr qW31B, [wptr2], #0x10\n" |
| 563 | "fmla vV31A.4s, vU31A.4s, vW11A.4s\n" |
| 564 | "ldr qU34B, [uptr2, u_col_stride3]\n" |
| 565 | "fmla vV21A.4s, vU41A.4s, vW31A.4s\n" |
| 566 | "str qV21A, [vptr1], #0x10\n" |
| 567 | "fmla vV31A.4s, vU41A.4s, vW21A.4s\n" |
| 568 | "ldr qU35B, [uptr2, u_col_stride4]\n" |
| 569 | "fmla vV31A.4s, vU51A.4s, vW31A.4s\n" |
| 570 | "str qV31A, [vptr2], #0x10\n" |
| 571 | |
| 572 | // B Part |
| 573 | "fmul vV13B.4s, vU15B.4s, vW13B.4s\n" |
| 574 | "ldr qU33B, [uptr2, u_col_stride2]\n" |
| 575 | "fmul vV12B.4s, vU14B.4s, vW13B.4s\n" |
| 576 | "fmla vV13B.4s, vU14B.4s, vW12B.4s\n" |
| 577 | "ldr qU45B, [uptr3, u_col_stride4]\n" |
| 578 | "fmul vV11B.4s, vU13B.4s, vW13B.4s\n" |
| 579 | "fmla vV12B.4s, vU13B.4s, vW12B.4s\n" |
| 580 | "fmla vV13B.4s, vU13B.4s, vW11B.4s\n" |
| 581 | "ldr qU44B, [uptr3, u_col_stride3]\n" |
| 582 | "fmla vV13B.4s, vU25B.4s, vW23B.4s\n" |
| 583 | "fmul vV23B.4s, vU25B.4s, vW13B.4s\n" |
| 584 | "ldr qU43B, [uptr3, u_col_stride2]\n" |
| 585 | "fmla vV12B.4s, vU24B.4s, vW23B.4s\n" |
| 586 | "fmla vV13B.4s, vU24B.4s, vW22B.4s\n" |
| 587 | "fmul vV22B.4s, vU24B.4s, vW13B.4s\n" |
| 588 | "fmla vV23B.4s, vU24B.4s, vW12B.4s\n" |
| 589 | "ldr qU55B, [uptr4, u_col_stride4]\n" |
| 590 | "fmla vV11B.4s, vU23B.4s, vW23B.4s\n" |
| 591 | "fmla vV12B.4s, vU23B.4s, vW22B.4s\n" |
| 592 | "fmla vV13B.4s, vU23B.4s, vW21B.4s\n" |
| 593 | "fmul vV21B.4s, vU23B.4s, vW13B.4s\n" |
| 594 | "fmla vV22B.4s, vU23B.4s, vW12B.4s\n" |
| 595 | "fmla vV23B.4s, vU23B.4s, vW11B.4s\n" |
| 596 | "ldr qU54B, [uptr4, u_col_stride3]\n" |
| 597 | "fmla vV13B.4s, vU35B.4s, vW33B.4s\n" |
| 598 | "fmla vV23B.4s, vU35B.4s, vW23B.4s\n" |
| 599 | "fmul vV33B.4s, vU35B.4s, vW13B.4s\n" |
| 600 | "ldr qU53B, [uptr4, u_col_stride2]\n" |
| 601 | "fmla vV12B.4s, vU34B.4s, vW33B.4s\n" |
| 602 | "fmla vV13B.4s, vU34B.4s, vW32B.4s\n" |
| 603 | "fmla vV22B.4s, vU34B.4s, vW23B.4s\n" |
| 604 | "fmla vV23B.4s, vU34B.4s, vW22B.4s\n" |
| 605 | "fmul vV32B.4s, vU34B.4s, vW13B.4s\n" |
| 606 | "fmla vV33B.4s, vU34B.4s, vW12B.4s\n" |
| 607 | "ldr qU12B, [%x[uptr0], u_col_stride1]\n" |
| 608 | "fmla vV11B.4s, vU33B.4s, vW33B.4s\n" |
| 609 | "fmla vV12B.4s, vU33B.4s, vW32B.4s\n" |
| 610 | "fmla vV13B.4s, vU33B.4s, vW31B.4s\n" |
| 611 | "str qV13B, [%x[vptr0], v_col_stride2]\n" |
| 612 | "fmla vV21B.4s, vU33B.4s, vW23B.4s\n" |
| 613 | "fmla vV22B.4s, vU33B.4s, vW22B.4s\n" |
| 614 | "fmla vV23B.4s, vU33B.4s, vW21B.4s\n" |
| 615 | "fmul vV31B.4s, vU33B.4s, vW13B.4s\n" |
| 616 | "fmla vV32B.4s, vU33B.4s, vW12B.4s\n" |
| 617 | "fmla vV33B.4s, vU33B.4s, vW11B.4s\n" |
| 618 | "ldr qU22B, [uptr1, u_col_stride1]\n" |
| 619 | "fmla vV23B.4s, vU45B.4s, vW33B.4s\n" |
| 620 | "fmla vV33B.4s, vU45B.4s, vW23B.4s\n" |
| 621 | "ldr qU32B, [uptr2, u_col_stride1]\n" |
| 622 | "fmla vV22B.4s, vU44B.4s, vW33B.4s\n" |
| 623 | "fmla vV23B.4s, vU44B.4s, vW32B.4s\n" |
| 624 | "fmla vV32B.4s, vU44B.4s, vW23B.4s\n" |
| 625 | "fmla vV33B.4s, vU44B.4s, vW22B.4s\n" |
| 626 | "ldr qU42B, [uptr3, u_col_stride1]\n" |
| 627 | "fmla vV21B.4s, vU43B.4s, vW33B.4s\n" |
| 628 | "fmla vV22B.4s, vU43B.4s, vW32B.4s\n" |
| 629 | "fmla vV23B.4s, vU43B.4s, vW31B.4s\n" |
| 630 | "str qV23B, [vptr1, v_col_stride2]\n" |
| 631 | "fmla vV31B.4s, vU43B.4s, vW23B.4s\n" |
| 632 | "fmla vV32B.4s, vU43B.4s, vW22B.4s\n" |
| 633 | "fmla vV33B.4s, vU43B.4s, vW21B.4s\n" |
| 634 | "ldr qU52B, [uptr4, u_col_stride1]\n" |
| 635 | "fmla vV33B.4s, vU55B.4s, vW33B.4s\n" |
| 636 | "ldr qU11B, [%x[uptr0]], #0x10\n" |
| 637 | "fmla vV32B.4s, vU54B.4s, vW33B.4s\n" |
| 638 | "fmla vV33B.4s, vU54B.4s, vW32B.4s\n" |
| 639 | "ldr qU21B, [uptr1], #0x10\n" |
| 640 | "fmla vV31B.4s, vU53B.4s, vW33B.4s\n" |
| 641 | "fmla vV32B.4s, vU53B.4s, vW32B.4s\n" |
| 642 | "fmla vV33B.4s, vU53B.4s, vW31B.4s\n" |
| 643 | "str qV33B, [vptr2, v_col_stride2]\n" |
| 644 | "fmla vV11B.4s, vU12B.4s, vW12B.4s\n" |
| 645 | "ldr qU31B, [uptr2], #0x10\n" |
| 646 | "fmla vV12B.4s, vU12B.4s, vW11B.4s\n" |
| 647 | "ldr qU41B, [uptr3], #0x10\n" |
| 648 | "fmla vV11B.4s, vU22B.4s, vW22B.4s\n" |
| 649 | "ldr qU51B, [uptr4], #0x10\n" |
| 650 | "fmla vV12B.4s, vU22B.4s, vW21B.4s\n" |
| 651 | "fmla vV21B.4s, vU22B.4s, vW12B.4s\n" |
| 652 | "fmla vV22B.4s, vU22B.4s, vW11B.4s\n" |
| 653 | "fmla vV11B.4s, vU32B.4s, vW32B.4s\n" |
| 654 | "fmla vV12B.4s, vU32B.4s, vW31B.4s\n" |
| 655 | "str qV12B, [%x[vptr0], v_col_stride1]\n" |
| 656 | "fmla vV21B.4s, vU32B.4s, vW22B.4s\n" |
| 657 | "fmla vV22B.4s, vU32B.4s, vW21B.4s\n" |
| 658 | "fmla vV31B.4s, vU32B.4s, vW12B.4s\n" |
| 659 | "fmla vV32B.4s, vU32B.4s, vW11B.4s\n" |
| 660 | "fmla vV21B.4s, vU42B.4s, vW32B.4s\n" |
| 661 | "fmla vV22B.4s, vU42B.4s, vW31B.4s\n" |
| 662 | "str qV22B, [vptr1, v_col_stride1]\n" |
| 663 | "fmla vV31B.4s, vU42B.4s, vW22B.4s\n" |
| 664 | "fmla vV32B.4s, vU42B.4s, vW21B.4s\n" |
| 665 | "fmla vV31B.4s, vU52B.4s, vW32B.4s\n" |
| 666 | "subs %x[n_iters], %x[n_iters], #1\n" |
| 667 | "fmla vV32B.4s, vU52B.4s, vW31B.4s\n" |
| 668 | "str qV32B, [vptr2, v_col_stride1]\n" |
| 669 | "fmla vV11B.4s, vU11B.4s, vW11B.4s\n" |
| 670 | "fmla vV11B.4s, vU21B.4s, vW21B.4s\n" |
| 671 | "fmla vV21B.4s, vU21B.4s, vW11B.4s\n" |
| 672 | "fmla vV11B.4s, vU31B.4s, vW31B.4s\n" |
| 673 | "str qV11B, [%x[vptr0]], #0x10\n" |
| 674 | "fmla vV21B.4s, vU31B.4s, vW21B.4s\n" |
| 675 | "fmla vV31B.4s, vU31B.4s, vW11B.4s\n" |
| 676 | "fmla vV21B.4s, vU41B.4s, vW31B.4s\n" |
| 677 | "str qV21B, [vptr1], #0x10\n" |
| 678 | "fmla vV31B.4s, vU41B.4s, vW21B.4s\n" |
| 679 | "fmla vV31B.4s, vU51B.4s, vW31B.4s\n" |
| 680 | "str qV31B, [vptr2], #0x10\n" |
| 681 | |
| 682 | "b 4f\n" // Branch to end of method |
| 683 | |
| 684 | "3:" // Odd tail, finish off A |
| 685 | "fmla vV13A.4s, vU14A.4s, vW12A.4s\n" |
| 686 | "ldr qU45A, [uptr3, u_col_stride4]\n" |
| 687 | "fmul vV11A.4s, vU13A.4s, vW13A.4s\n" |
| 688 | "fmla vV12A.4s, vU13A.4s, vW12A.4s\n" |
| 689 | "fmla vV13A.4s, vU13A.4s, vW11A.4s\n" |
| 690 | "ldr qU44A, [uptr3, u_col_stride3]\n" |
| 691 | "fmla vV13A.4s, vU25A.4s, vW23A.4s\n" |
| 692 | "fmul vV23A.4s, vU25A.4s, vW13A.4s\n" |
| 693 | "ldr qU43A, [uptr3, u_col_stride2]\n" |
| 694 | "fmla vV12A.4s, vU24A.4s, vW23A.4s\n" |
| 695 | "fmla vV13A.4s, vU24A.4s, vW22A.4s\n" |
| 696 | "fmul vV22A.4s, vU24A.4s, vW13A.4s\n" |
| 697 | "fmla vV23A.4s, vU24A.4s, vW12A.4s\n" |
| 698 | "ldr qU55A, [uptr4, u_col_stride4]\n" |
| 699 | "fmla vV11A.4s, vU23A.4s, vW23A.4s\n" |
| 700 | "fmla vV12A.4s, vU23A.4s, vW22A.4s\n" |
| 701 | "fmla vV13A.4s, vU23A.4s, vW21A.4s\n" |
| 702 | "fmul vV21A.4s, vU23A.4s, vW13A.4s\n" |
| 703 | "fmla vV22A.4s, vU23A.4s, vW12A.4s\n" |
| 704 | "fmla vV23A.4s, vU23A.4s, vW11A.4s\n" |
| 705 | "ldr qU54A, [uptr4, u_col_stride3]\n" |
| 706 | "fmla vV13A.4s, vU35A.4s, vW33A.4s\n" |
| 707 | "fmla vV23A.4s, vU35A.4s, vW23A.4s\n" |
| 708 | "fmul vV33A.4s, vU35A.4s, vW13A.4s\n" |
| 709 | "ldr qU53A, [uptr4, u_col_stride2]\n" |
| 710 | "fmla vV12A.4s, vU34A.4s, vW33A.4s\n" |
| 711 | "fmla vV13A.4s, vU34A.4s, vW32A.4s\n" |
| 712 | "fmla vV22A.4s, vU34A.4s, vW23A.4s\n" |
| 713 | "fmla vV23A.4s, vU34A.4s, vW22A.4s\n" |
| 714 | "fmul vV32A.4s, vU34A.4s, vW13A.4s\n" |
| 715 | "fmla vV33A.4s, vU34A.4s, vW12A.4s\n" |
| 716 | "ldr qU12A, [%x[uptr0], u_col_stride1]\n" |
| 717 | "fmla vV11A.4s, vU33A.4s, vW33A.4s\n" |
| 718 | "fmla vV12A.4s, vU33A.4s, vW32A.4s\n" |
| 719 | "fmla vV13A.4s, vU33A.4s, vW31A.4s\n" |
| 720 | "str qV13A, [%x[vptr0], v_col_stride2]\n" |
| 721 | "fmla vV21A.4s, vU33A.4s, vW23A.4s\n" |
| 722 | "fmla vV22A.4s, vU33A.4s, vW22A.4s\n" |
| 723 | "fmla vV23A.4s, vU33A.4s, vW21A.4s\n" |
| 724 | "fmul vV31A.4s, vU33A.4s, vW13A.4s\n" |
| 725 | "fmla vV32A.4s, vU33A.4s, vW12A.4s\n" |
| 726 | "fmla vV33A.4s, vU33A.4s, vW11A.4s\n" |
| 727 | "ldr qU22A, [uptr1, u_col_stride1]\n" |
| 728 | "fmla vV23A.4s, vU45A.4s, vW33A.4s\n" |
| 729 | "fmla vV33A.4s, vU45A.4s, vW23A.4s\n" |
| 730 | "ldr qU32A, [uptr2, u_col_stride1]\n" |
| 731 | "fmla vV22A.4s, vU44A.4s, vW33A.4s\n" |
| 732 | "fmla vV23A.4s, vU44A.4s, vW32A.4s\n" |
| 733 | "fmla vV32A.4s, vU44A.4s, vW23A.4s\n" |
| 734 | "fmla vV33A.4s, vU44A.4s, vW22A.4s\n" |
| 735 | "ldr qU42A, [uptr3, u_col_stride1]\n" |
| 736 | "fmla vV21A.4s, vU43A.4s, vW33A.4s\n" |
| 737 | "fmla vV22A.4s, vU43A.4s, vW32A.4s\n" |
| 738 | "fmla vV23A.4s, vU43A.4s, vW31A.4s\n" |
| 739 | "str qV23A, [vptr1, v_col_stride2]\n" |
| 740 | "fmla vV31A.4s, vU43A.4s, vW23A.4s\n" |
| 741 | "fmla vV32A.4s, vU43A.4s, vW22A.4s\n" |
| 742 | "fmla vV33A.4s, vU43A.4s, vW21A.4s\n" |
| 743 | "ldr qU52A, [uptr4, u_col_stride1]\n" |
| 744 | "fmla vV33A.4s, vU55A.4s, vW33A.4s\n" |
| 745 | "ldr qU11A, [%x[uptr0]], #0x10\n" |
| 746 | "fmla vV32A.4s, vU54A.4s, vW33A.4s\n" |
| 747 | "fmla vV33A.4s, vU54A.4s, vW32A.4s\n" |
| 748 | "ldr qU21A, [uptr1], #0x10\n" |
| 749 | "fmla vV31A.4s, vU53A.4s, vW33A.4s\n" |
| 750 | "fmla vV32A.4s, vU53A.4s, vW32A.4s\n" |
| 751 | "fmla vV33A.4s, vU53A.4s, vW31A.4s\n" |
| 752 | "str qV33A, [vptr2, v_col_stride2]\n" |
| 753 | "fmla vV11A.4s, vU12A.4s, vW12A.4s\n" |
| 754 | "ldr qU31A, [uptr2], #0x10\n" |
| 755 | "fmla vV12A.4s, vU12A.4s, vW11A.4s\n" |
| 756 | "ldr qU41A, [uptr3], #0x10\n" |
| 757 | "fmla vV11A.4s, vU22A.4s, vW22A.4s\n" |
| 758 | "ldr qU51A, [uptr4], #0x10\n" |
| 759 | "fmla vV12A.4s, vU22A.4s, vW21A.4s\n" |
| 760 | "fmla vV21A.4s, vU22A.4s, vW12A.4s\n" |
| 761 | "fmla vV22A.4s, vU22A.4s, vW11A.4s\n" |
| 762 | "fmla vV11A.4s, vU32A.4s, vW32A.4s\n" |
| 763 | "fmla vV12A.4s, vU32A.4s, vW31A.4s\n" |
| 764 | "str qV12A, [%x[vptr0], v_col_stride1]\n" |
| 765 | "fmla vV21A.4s, vU32A.4s, vW22A.4s\n" |
| 766 | "fmla vV22A.4s, vU32A.4s, vW21A.4s\n" |
| 767 | "fmla vV31A.4s, vU32A.4s, vW12A.4s\n" |
| 768 | "fmla vV32A.4s, vU32A.4s, vW11A.4s\n" |
| 769 | "fmla vV21A.4s, vU42A.4s, vW32A.4s\n" |
| 770 | "fmla vV22A.4s, vU42A.4s, vW31A.4s\n" |
| 771 | "str qV22A, [vptr1, v_col_stride1]\n" |
| 772 | "fmla vV31A.4s, vU42A.4s, vW22A.4s\n" |
| 773 | "fmla vV32A.4s, vU42A.4s, vW21A.4s\n" |
| 774 | "fmla vV31A.4s, vU52A.4s, vW32A.4s\n" |
| 775 | "fmla vV32A.4s, vU52A.4s, vW31A.4s\n" |
| 776 | "str qV32A, [vptr2, v_col_stride1]\n" |
| 777 | "fmla vV11A.4s, vU11A.4s, vW11A.4s\n" |
| 778 | "fmla vV11A.4s, vU21A.4s, vW21A.4s\n" |
| 779 | "fmla vV21A.4s, vU21A.4s, vW11A.4s\n" |
| 780 | "fmla vV11A.4s, vU31A.4s, vW31A.4s\n" |
| 781 | "str qV11A, [%x[vptr0]], #0x10\n" |
| 782 | "fmla vV21A.4s, vU31A.4s, vW21A.4s\n" |
| 783 | "fmla vV31A.4s, vU31A.4s, vW11A.4s\n" |
| 784 | "fmla vV21A.4s, vU41A.4s, vW31A.4s\n" |
| 785 | "str qV21A, [vptr1], #0x10\n" |
| 786 | "fmla vV31A.4s, vU41A.4s, vW21A.4s\n" |
| 787 | "fmla vV31A.4s, vU51A.4s, vW31A.4s\n" |
| 788 | "str qV31A, [vptr2], #0x10\n" |
| 789 | |
| 790 | "4:" // End of method |
| 791 | ".unreq uptr1\n" ".unreq uptr2\n" ".unreq uptr3\n" ".unreq uptr4\n" |
| 792 | ".unreq u_col_stride1\n" ".unreq u_col_stride2\n" |
| 793 | ".unreq u_col_stride3\n" ".unreq u_col_stride4\n" |
| 794 | ".unreq wptr1\n" ".unreq wptr2\n" |
| 795 | ".unreq w_col_stride1\n" ".unreq w_col_stride2\n" |
| 796 | ".unreq vptr1\n" ".unreq vptr2\n" |
| 797 | ".unreq v_col_stride1\n" ".unreq v_col_stride2\n" |
| 798 | |
| 799 | ".unreq qU22B\n" ".unreq qW13B\n" ".unreq qW13A\n" ".unreq qU51B\n" |
| 800 | ".unreq qU54B\n" ".unreq qU45A\n" ".unreq qU15A\n" ".unreq qU41B\n" |
| 801 | ".unreq qU24B\n" ".unreq qU21A\n" |
| 802 | ".unreq qV11B\n" ".unreq qU51A\n" ".unreq qU35A\n" ".unreq qU12A\n" |
| 803 | ".unreq qU42B\n" ".unreq qU44B\n" ".unreq qU13B\n" ".unreq qW33A\n" |
| 804 | ".unreq qV31B\n" ".unreq qV23A\n" ".unreq qU31A\n" ".unreq qU35B\n" ".unreq qU13A\n" |
| 805 | ".unreq qV23B\n" ".unreq qU11A\n" ".unreq qU25A\n" ".unreq qU43A\n" ".unreq qU52B\n" |
| 806 | ".unreq qU24A\n" ".unreq qU23B\n" ".unreq qV21A\n" ".unreq qV32B\n" |
| 807 | ".unreq qV33B\n" ".unreq qW11A\n" ".unreq qU31B\n" |
| 808 | ".unreq qW12B\n" ".unreq qU33A\n" ".unreq qU14A\n" ".unreq qU22A\n" |
| 809 | ".unreq qU25B\n" ".unreq qU53B\n" ".unreq qU42A\n" ".unreq qU44A\n" |
| 810 | ".unreq qU43B\n" ".unreq qW31A\n" ".unreq qU11B\n" |
| 811 | ".unreq qW11B\n" ".unreq qW32A\n" |
| 812 | ".unreq qU12B\n" ".unreq qU34B\n" ".unreq qW21A\n" |
| 813 | ".unreq qU14B\n" ".unreq qV21B\n" ".unreq qW22A\n" |
| 814 | ".unreq qW23B\n" ".unreq qW23A\n" ".unreq qU21B\n" |
| 815 | ".unreq qU32B\n" ".unreq qU34A\n" ".unreq qU45B\n" ".unreq qV31A\n" |
| 816 | ".unreq qW12A\n" ".unreq qU33B\n" ".unreq qU15B\n" |
| 817 | ".unreq qW33B\n" ".unreq qU54A\n" ".unreq qU23A\n" |
| 818 | ".unreq qW32B\n" ".unreq qV33A\n" ".unreq qW31B\n" ".unreq qV12A\n" |
| 819 | ".unreq qV12B\n" ".unreq qU41A\n" ".unreq qU53A\n" |
| 820 | ".unreq qV13A\n" ".unreq qU32A\n" ".unreq qW22B\n" |
| 821 | ".unreq qV22B\n" ".unreq qU52A\n" ".unreq qV13B\n" ".unreq qV32A\n" |
| 822 | ".unreq qU55A\n" ".unreq qU55B\n" ".unreq qV22A\n" ".unreq qW21B\n" |
| 823 | ".unreq qV11A\n" |
| 824 | ".unreq vU22B\n" ".unreq vW13B\n" ".unreq vW13A\n" ".unreq vU51B\n" |
| 825 | ".unreq vU54B\n" ".unreq vU45A\n" ".unreq vU15A\n" ".unreq vU41B\n" |
| 826 | ".unreq vU24B\n" ".unreq vU21A\n" |
| 827 | ".unreq vV11B\n" ".unreq vU51A\n" ".unreq vU35A\n" ".unreq vU12A\n" |
| 828 | ".unreq vU42B\n" ".unreq vU44B\n" ".unreq vU13B\n" ".unreq vW33A\n" |
| 829 | ".unreq vV31B\n" ".unreq vV23A\n" ".unreq vU31A\n" ".unreq vU35B\n" ".unreq vU13A\n" |
| 830 | ".unreq vV23B\n" ".unreq vU11A\n" ".unreq vU25A\n" ".unreq vU43A\n" ".unreq vU52B\n" |
| 831 | ".unreq vU24A\n" ".unreq vU23B\n" ".unreq vV21A\n" ".unreq vV32B\n" |
| 832 | ".unreq vV33B\n" ".unreq vW11A\n" ".unreq vU31B\n" |
| 833 | ".unreq vW12B\n" ".unreq vU33A\n" ".unreq vU14A\n" ".unreq vU22A\n" |
| 834 | ".unreq vU25B\n" ".unreq vU53B\n" ".unreq vU42A\n" ".unreq vU44A\n" |
| 835 | ".unreq vU43B\n" ".unreq vW31A\n" ".unreq vU11B\n" |
| 836 | ".unreq vW11B\n" ".unreq vW32A\n" |
| 837 | ".unreq vU12B\n" ".unreq vU34B\n" ".unreq vW21A\n" |
| 838 | ".unreq vU14B\n" ".unreq vV21B\n" ".unreq vW22A\n" |
| 839 | ".unreq vW23B\n" ".unreq vW23A\n" ".unreq vU21B\n" |
| 840 | ".unreq vU32B\n" ".unreq vU34A\n" ".unreq vU45B\n" ".unreq vV31A\n" |
| 841 | ".unreq vW12A\n" ".unreq vU33B\n" ".unreq vU15B\n" |
| 842 | ".unreq vW33B\n" ".unreq vU54A\n" ".unreq vU23A\n" |
| 843 | ".unreq vW32B\n" ".unreq vV33A\n" ".unreq vW31B\n" ".unreq vV12A\n" |
| 844 | ".unreq vV12B\n" ".unreq vU41A\n" ".unreq vU53A\n" |
| 845 | ".unreq vV13A\n" ".unreq vU32A\n" ".unreq vW22B\n" |
| 846 | ".unreq vV22B\n" ".unreq vU52A\n" ".unreq vV13B\n" ".unreq vV32A\n" |
| 847 | ".unreq vU55A\n" ".unreq vU55B\n" ".unreq vV22A\n" ".unreq vW21B\n" |
| 848 | ".unreq vV11A\n" |
| 849 | : [uptr0] "+r" (uptr0), [wptr0] "+r" (wptr0), [vptr0] "+r" (vptr0), |
| 850 | [n_iters] "+r" (n_iters) |
| 851 | : [u_row_stride] "r" (in_row_stride * sizeof(float)), |
| 852 | [u_col_stride] "r" (in_col_stride * sizeof(float)), |
| 853 | [w_row_stride] "r" (weight_row_stride * sizeof(float)), |
| 854 | [w_col_stride] "r" (weight_col_stride * sizeof(float)), |
| 855 | [v_row_stride] "r" (out_row_stride * sizeof(float)), |
| 856 | [v_col_stride] "r" (out_col_stride * sizeof(float)), |
| 857 | [odd_tail] "r" (odd_tail) |
| 858 | : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", |
| 859 | "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", |
| 860 | "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", |
| 861 | "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", |
| 862 | "x12", "cc", "memory" |
| 863 | ); |
| 864 | } |
| 865 | if (channels_remaining) |
| 866 | { |
| 867 | // Fall back on the unoptimised version to clean up the tail |
| 868 | ConvImpl::process_tile<false>( |
| 869 | channels_remaining, |
| 870 | wptr0, weight_row_stride, weight_col_stride, |
| 871 | uptr0, in_row_stride, in_col_stride, |
| 872 | vptr0, out_row_stride, out_col_stride, |
| 873 | 0, 0, 0, 0, 0, 0 |
| 874 | ); |
| 875 | } |
| 876 | } |
| 877 | |
| 878 | #endif // __aarch64__ |
| 879 | |
// Specialisation of the `tilefn_unpadded` static member for `Conv`.
// It points at the `process_tile<true, ...>` instantiation in which all six
// integer template arguments are zero.
// NOTE(review): the six integers are presumably, in order, in-pad top, left,
// bottom, right and out-pad bottom, right (inferred from the tables that
// follow) - confirm against the declaration of process_tile.
| 880 | template <> |
| 881 | const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>; |
| 882 | |
// Specialisation of the `tilefn_top` table for `Conv`.
// The initialiser lists exactly one entry: `process_tile` with the first
// integer template argument set to 1 and the other five set to 0.
// NOTE(review): presumably n_in_pad_top_fns == 1 and index 0 corresponds to a
// top input padding of 1 - confirm against the header that declares the table.
| 883 | template <> |
| 884 | const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = { |
| 885 | ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>, |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 886 | }; |
| 887 | |
// Specialisation of the `tilefn_left` table for `Conv`.
// The initialiser lists exactly one entry: `process_tile` with the second
// integer template argument set to 1 and the other five set to 0.
// NOTE(review): presumably n_in_pad_left_fns == 1 and index 0 corresponds to a
// left input padding of 1 - confirm against the header that declares the table.
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame^] | 888 | template <> |
| 889 | const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = { |
| 890 | ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>, |
| 891 | }; |
| 892 | |
// Specialisation of the two-dimensional `tilefn_bottom` table for `Conv`.
// Layout of the initialiser as written here (5 rows of 3 entries):
//   - the outer (row) index i, 0..4, is the value of the third integer
//     template argument of `process_tile`;
//   - the inner (column) index j, 0..2, is the value of the fifth integer
//     template argument;
//   - every other integer argument is 0.
// Entry [0][0] is therefore the same instantiation as `tilefn_unpadded`.
// NOTE(review): judging by the dimension names, i is the bottom input padding
// and j the bottom output padding - confirm against the header.
| 893 | template <> |
| 894 | const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = { |
| 895 | { |
| 896 | ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>, |
| 897 | ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>, |
| 898 | ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>, |
| 899 | }, |
| 900 | { |
| 901 | ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>, |
| 902 | ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>, |
| 903 | ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>, |
| 904 | }, |
| 905 | { |
| 906 | ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>, |
| 907 | ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>, |
| 908 | ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>, |
| 909 | }, |
| 910 | { |
| 911 | ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>, |
| 912 | ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>, |
| 913 | ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>, |
| 914 | }, |
| 915 | { |
| 916 | ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>, |
| 917 | ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>, |
| 918 | ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>, |
| 919 | }, |
| 920 | }; |
| 921 | |
// Specialisation of the two-dimensional `tilefn_right` table for `Conv`.
// Layout of the initialiser as written here (5 rows of 3 entries):
//   - the outer (row) index i, 0..4, is the value of the fourth integer
//     template argument of `process_tile`;
//   - the inner (column) index j, 0..2, is the value of the sixth integer
//     template argument;
//   - every other integer argument is 0.
// Entry [0][0] is therefore the same instantiation as `tilefn_unpadded`.
// NOTE(review): judging by the dimension names, i is the right input padding
// and j the right output padding - confirm against the header.
| 922 | template <> |
| 923 | const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = { |
| 924 | { |
| 925 | ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>, |
| 926 | ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>, |
| 927 | ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>, |
| 928 | }, |
| 929 | { |
| 930 | ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>, |
| 931 | ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>, |
| 932 | ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>, |
| 933 | }, |
| 934 | { |
| 935 | ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>, |
| 936 | ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>, |
| 937 | ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>, |
| 938 | }, |
| 939 | { |
| 940 | ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>, |
| 941 | ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>, |
| 942 | ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>, |
| 943 | }, |
| 944 | { |
| 945 | ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>, |
| 946 | ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>, |
| 947 | ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>, |
| 948 | }, |
| 949 | }; |
| 950 | |
// Specialisation of the `tilefn_generic` static member for `Conv`.
// It points at `process_tile<false>`, the same instantiation the AArch64
// specialisation above calls (with six trailing zero arguments) to handle the
// channels left over after its assembly loop.
// NOTE(review): presumably with `false` the padding amounts are taken from the
// run-time arguments rather than from template parameters - confirm against
// the definition of process_tile.
| 951 | template <> |
| 952 | const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>; |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 953 | |
// Explicit instantiation of the class template with the same arguments as the
// `Conv` alias declared at the top of this namespace.
| 954 | template class DepthwiseConvolution<3, 3, 3, 3, 1, 1, float, float>; |
| 955 | } // namespace depthwise |