Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (c) 2018 ARM Limited. |
| 3 | * |
| 4 | * SPDX-License-Identifier: MIT |
| 5 | * |
| 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 7 | * of this software and associated documentation files (the "Software"), to |
| 8 | * deal in the Software without restriction, including without limitation the |
| 9 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
| 10 | * sell copies of the Software, and to permit persons to whom the Software is |
| 11 | * furnished to do so, subject to the following conditions: |
| 12 | * |
| 13 | * The above copyright notice and this permission notice shall be included in all |
| 14 | * copies or substantial portions of the Software. |
| 15 | * |
| 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 22 | * SOFTWARE. |
| 23 | */ |
Georgios Pinitas | 20c246a | 2018-09-12 16:45:53 +0100 | [diff] [blame^] | 24 | #include "impl_fp32_fp32.hpp" |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 25 | |
| 26 | namespace depthwise |
| 27 | { |
| 28 | using Conv = DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float>; |
| 29 | using ConvImpl = DepthwiseConvolutionImpl<3, 3, 3, 3, 2, 2, float, float>; |
| 30 | |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame] | 31 | #ifdef __aarch64__ |
| 32 | |
// Specialisation of process_tile for the case where every padding template
// argument is zero, implemented with hand-scheduled AArch64 NEON assembly.
//
// For each block of 4 channels (one 128-bit q register of floats) the assembly
// reads a 7x7 patch of input vectors (U11..U77), a 3x3 set of weight vectors
// (W11..W33) and writes a 3x3 patch of output vectors (V11..V33). Each output
// Vij accumulates the products of the weights with input rows 2i-1..2i+1 and
// input columns 2j-1..2j+1 (1-based), i.e. the window moves by two inputs per
// output in both directions.
//
// Parameters:
//   n_channels                      number of channels to process.
//   weights                         base pointer of the weights.
//   weight_row_stride/col_stride    weight strides, in float elements
//                                   (scaled by sizeof(float) before use).
//   inptr                           base pointer of the input patch.
//   in_row_stride/in_col_stride     input strides, in float elements.
//   outptr                          base pointer of the output patch.
//   out_row_stride/out_col_stride   output strides, in float elements.
//   trailing six ints               unnamed and ignored by this specialisation.
//
// Channels are consumed 4 at a time by the assembly; any remaining
// (n_channels % 4) channels are handed to process_tile<false>.
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 33 | template <> |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame] | 34 | template <> |
| 35 | void ConvImpl::process_tile<true, 0, 0, 0, 0, 0, 0>( |
| 36 | const int n_channels, |
| 37 | const float* const weights, |
| 38 | const int weight_row_stride, |
| 39 | const int weight_col_stride, |
| 40 | const float* const inptr, |
| 41 | const int in_row_stride, |
| 42 | const int in_col_stride, |
| 43 | float* const outptr, |
| 44 | const int out_row_stride, |
| 45 | const int out_col_stride, |
| 46 | const int, const int, const int, const int, const int, const int |
| 47 | ) |
| 48 | { |
| 49 | // Copy pointers |
// These copies are read/write asm operands: the assembly advances them by 16
// bytes per 4-channel block, so afterwards they address the leftover channels.
| 50 | const float *uptr0 = inptr; |
| 51 | const float *wptr0 = weights; |
| 52 | float *vptr0 = outptr; |
| 53 | |
| 54 | int channels_remaining = n_channels; |
| 55 | if (channels_remaining >= 4) |
| 56 | { |
| 57 | // Process blocks of 4 channels at a time |
// The assembly always executes one "tail" iteration after the loop, so the
// loop counter is the number of 4-channel blocks minus one.
// NOTE(review): n_iters is an int but is accessed as a 64-bit register via
// %x[n_iters]; this relies on the upper 32 bits being zero -- confirm.
| 58 | int n_iters = channels_remaining / 4 - 1; |
| 59 | channels_remaining %= 4; |
| 60 | |
| 61 | asm volatile( |
| 62 | // Prepare aliases |
// qXrc / vXrc name the same SIMD register viewed as q (for ldr/str) or v (for
// fmul/fmla); X is U (input), W (weight) or V (output), r/c are 1-based row
// and column. Several U/V names deliberately share one physical register
// because their live ranges do not overlap in the schedule below.
| 63 | "qW13 .req q0\n" "vW13 .req v0\n" |
| 64 | "qU15 .req q1\n" "qU73 .req q1\n" "qU45 .req q1\n" "qU14 .req q1\n" |
| 65 | "vU15 .req v1\n" "vU73 .req v1\n" "vU45 .req v1\n" "vU14 .req v1\n" |
| 66 | "qU62 .req q2\n" "qV12 .req q2\n" "vU62 .req v2\n" "vV12 .req v2\n" |
| 67 | "qU51 .req q3\n" "qU43 .req q3\n" "qU55 .req q3\n" |
| 68 | "vU51 .req v3\n" "vU43 .req v3\n" "vU55 .req v3\n" |
| 69 | "qU77 .req q4\n" "qV13 .req q4\n" "qV31 .req q4\n" "qU44 .req q4\n" |
| 70 | "vU77 .req v4\n" "vV13 .req v4\n" "vV31 .req v4\n" "vU44 .req v4\n" |
| 71 | "qV33 .req q5\n" "qU46 .req q5\n" "qU11 .req q5\n" "qU37 .req q5\n" |
| 72 | "vV33 .req v5\n" "vU46 .req v5\n" "vU11 .req v5\n" "vU37 .req v5\n" |
| 73 | "qU56 .req q6\n" "qU25 .req q6\n" "qU32 .req q6\n" |
| 74 | "vU56 .req v6\n" "vU25 .req v6\n" "vU32 .req v6\n" |
| 75 | "qU72 .req q7\n" "qV22 .req q7\n" "vU72 .req v7\n" "vV22 .req v7\n" |
| 76 | "qU67 .req q8\n" "qU61 .req q8\n" "qU13 .req q8\n" |
| 77 | "vU67 .req v8\n" "vU61 .req v8\n" "vU13 .req v8\n" |
| 78 | "qU74 .req q9\n" "qU34 .req q9\n" "qU17 .req q9\n" "qU66 .req q9\n" |
| 79 | "vU74 .req v9\n" "vU34 .req v9\n" "vU17 .req v9\n" "vU66 .req v9\n" |
| 80 | "qU33 .req q10\n" "qU57 .req q10\n" "qU21 .req q10\n" |
| 81 | "vU33 .req v10\n" "vU57 .req v10\n" "vU21 .req v10\n" "qW23 .req q11\n" |
| 82 | "vW23 .req v11\n" "qU42 .req q12\n" "qV23 .req q12\n" "qU23 .req q12\n" |
| 83 | "vU42 .req v12\n" "vV23 .req v12\n" "vU23 .req v12\n" |
| 84 | "qW33 .req q13\n" "vW33 .req v13\n" |
| 85 | "qU76 .req q14\n" "qU47 .req q14\n" "qU64 .req q14\n" "qU41 .req q14\n" |
| 86 | "vU76 .req v14\n" "vU47 .req v14\n" "vU64 .req v14\n" "vU41 .req v14\n" |
| 87 | "qU52 .req q15\n" "qU54 .req q15\n" "qU75 .req q15\n" "qU26 .req q15\n" |
| 88 | "vU52 .req v15\n" "vU54 .req v15\n" "vU75 .req v15\n" "vU26 .req v15\n" |
| 89 | "qU53 .req q16\n" "qU27 .req q16\n" "vU53 .req v16\n" "vU27 .req v16\n" |
| 90 | "qV21 .req q17\n" "qU65 .req q17\n" "vV21 .req v17\n" "vU65 .req v17\n" |
| 91 | "qU31 .req q18\n" "qU24 .req q18\n" "qU36 .req q18\n" |
| 92 | "vU31 .req v18\n" "vU24 .req v18\n" "vU36 .req v18\n" "qU22 .req q19\n" |
| 93 | "vU22 .req v19\n" "qU35 .req q20\n" "qU63 .req q20\n" |
| 94 | "vU35 .req v20\n" "vU63 .req v20\n" "qW12 .req q21\n" |
| 95 | "vW12 .req v21\n" "qV32 .req q22\n" "qU16 .req q22\n" |
| 96 | "vV32 .req v22\n" "vU16 .req v22\n" "qW11 .req q23\n" "vW11 .req v23\n" |
| 97 | "qU12 .req q24\n" "vU12 .req v24\n" "qW31 .req q25\n" "vW31 .req v25\n" |
| 98 | "qW22 .req q26\n" "vW22 .req v26\n" "qU71 .req q27\n" "vU71 .req v27\n" |
| 99 | "qV11 .req q28\n" "vV11 .req v28\n" "qW21 .req q29\n" "vW21 .req v29\n" |
| 100 | "qW32 .req q30\n" "vW32 .req v30\n" |
| 101 | |
// General-purpose register aliases: row pointers for input rows 2..7, weight
// rows 2..3 and output rows 2..3, plus multiples of each column stride. The
// "...stride1" names alias the asm input operands directly; x0-x16 are
// scratch registers declared in the clobber list.
| 102 | "uptr1 .req x0\n" |
| 103 | "uptr2 .req x1\n" |
| 104 | "uptr3 .req x2\n" |
| 105 | "uptr4 .req x3\n" |
| 106 | "uptr5 .req x4\n" |
| 107 | "uptr6 .req x5\n" |
| 108 | "u_col_stride1 .req %x[u_col_stride]\n" |
| 109 | "u_col_stride2 .req x6\n" |
| 110 | "u_col_stride3 .req x7\n" |
| 111 | "u_col_stride4 .req x8\n" |
| 112 | "u_col_stride5 .req x9\n" |
| 113 | "u_col_stride6 .req x10\n" |
| 114 | "wptr1 .req x11\n" |
| 115 | "wptr2 .req x12\n" |
| 116 | "w_col_stride1 .req %x[w_col_stride]\n" |
| 117 | "w_col_stride2 .req x13\n" |
| 118 | "vptr1 .req x14\n" |
| 119 | "vptr2 .req x15\n" |
| 120 | "v_col_stride1 .req %x[v_col_stride]\n" |
| 121 | "v_col_stride2 .req x16\n" |
| 122 | |
| 123 | // Prepare strides and pointers |
// Row pointers are built by repeatedly adding the byte row stride; column
// offsets k * col_stride are built by repeated addition of the byte stride.
| 124 | "add uptr1, %x[uptr0], %x[u_row_stride]\n" |
| 125 | "add uptr2, uptr1 , %x[u_row_stride]\n" |
| 126 | "add uptr3, uptr2 , %x[u_row_stride]\n" |
| 127 | "add uptr4, uptr3 , %x[u_row_stride]\n" |
| 128 | "add uptr5, uptr4 , %x[u_row_stride]\n" |
| 129 | "add uptr6, uptr5 , %x[u_row_stride]\n" |
| 130 | "add u_col_stride2, u_col_stride1, u_col_stride1\n" |
| 131 | "add u_col_stride3, u_col_stride2, u_col_stride1\n" |
| 132 | "add u_col_stride4, u_col_stride3, u_col_stride1\n" |
| 133 | "add u_col_stride5, u_col_stride4, u_col_stride1\n" |
| 134 | "add u_col_stride6, u_col_stride5, u_col_stride1\n" |
| 135 | |
| 136 | "add wptr1, %x[wptr0], %x[w_row_stride]\n" |
| 137 | "add wptr2, wptr1 , %x[w_row_stride]\n" |
| 138 | "add w_col_stride2, w_col_stride1, w_col_stride1\n" |
| 139 | |
| 140 | "add vptr1, %x[vptr0], %x[v_row_stride]\n" |
| 141 | "add vptr2, vptr1 , %x[v_row_stride]\n" |
| 142 | "add v_col_stride2, v_col_stride1, v_col_stride1\n" |
| 143 | |
| 144 | // Prepare for first iteration |
// Prologue of the software pipeline: load all nine weights for the first
// block (the column-0 weight loads post-increment their pointers by 16
// bytes), load the first inputs from columns 5..7 and start accumulating
// V13, V12, V23 and V22. The same sequence is repeated at the bottom of the
// loop body to set up the following block.
| 145 | "ldr qW13, [%x[wptr0], w_col_stride2]\n" |
| 146 | "ldr qW23, [wptr1, w_col_stride2]\n" |
| 147 | "ldr qW33, [wptr2, w_col_stride2]\n" |
| 148 | "ldr qW12, [%x[wptr0], w_col_stride1]\n" |
| 149 | "ldr qW22, [wptr1, w_col_stride1]\n" |
| 150 | "ldr qW32, [wptr2, w_col_stride1]\n" |
| 151 | "ldr qW11, [%x[wptr0]], #0x10\n" |
| 152 | "ldr qW21, [wptr1], #0x10\n" |
| 153 | "ldr qU17, [%x[uptr0], u_col_stride6]\n" |
| 154 | "ldr qU15, [%x[uptr0], u_col_stride4]\n" |
| 155 | "ldr qU16, [%x[uptr0], u_col_stride5]\n" |
| 156 | "ldr qU37, [uptr2, u_col_stride6]\n" |
| 157 | "ldr qU35, [uptr2, u_col_stride4]\n" |
| 158 | "ldr qU36, [uptr2, u_col_stride5]\n" |
| 159 | "ldr qU27, [uptr1, u_col_stride6]\n" |
| 160 | "ldr qU25, [uptr1, u_col_stride4]\n" |
| 161 | "fmul vV13.4s, vU17.4s, vW13.4s\n" |
| 162 | "fmul vV12.4s, vU15.4s, vW13.4s\n" |
| 163 | "fmla vV13.4s, vU15.4s, vW11.4s\n" |
| 164 | "ldr qW31, [wptr2], #0x10\n" |
| 165 | "fmla vV13.4s, vU16.4s, vW12.4s\n" |
| 166 | "ldr qU26, [uptr1, u_col_stride5]\n" |
| 167 | "fmla vV13.4s, vU37.4s, vW33.4s\n" |
| 168 | "ldr qU47, [uptr3, u_col_stride6]\n" |
| 169 | "fmul vV23.4s, vU37.4s, vW13.4s\n" |
| 170 | "ldr qU45, [uptr3, u_col_stride4]\n" |
| 171 | "fmla vV12.4s, vU35.4s, vW33.4s\n" |
| 172 | "ldr qU46, [uptr3, u_col_stride5]\n" |
| 173 | "fmla vV13.4s, vU35.4s, vW31.4s\n" |
| 174 | "ldr qU67, [uptr5, u_col_stride6]\n" |
| 175 | "fmul vV22.4s, vU35.4s, vW13.4s\n" |
| 176 | "cbz %x[n_iters], 2f\n" // Jump to tail if no iterations |
| 177 | |
// Steady-state loop: finishes the current block working from the right-hand
// output column (V*3) to the left-hand one (V*1), storing each output as soon
// as its accumulation completes. The column-0 loads/stores post-increment
// every row pointer by 16 bytes, after which the weights and first inputs of
// the next block are loaded and its first products computed.
| 178 | "1:" // Loop body |
| 179 | "fmla vV23.4s, vU35.4s, vW11.4s\n" |
| 180 | "ldr qU65, [uptr5, u_col_stride4]\n" |
| 181 | "fmla vV13.4s, vU36.4s, vW32.4s\n" |
| 182 | "fmla vV23.4s, vU36.4s, vW12.4s\n" |
| 183 | "ldr qU66, [uptr5, u_col_stride5]\n" |
| 184 | "fmla vV13.4s, vU27.4s, vW23.4s\n" |
| 185 | "ldr qU57, [uptr4, u_col_stride6]\n" |
| 186 | "fmla vV12.4s, vU25.4s, vW23.4s\n" |
| 187 | "ldr qU55, [uptr4, u_col_stride4]\n" |
| 188 | "fmla vV13.4s, vU25.4s, vW21.4s\n" |
| 189 | "ldr qU56, [uptr4, u_col_stride5]\n" |
| 190 | "fmla vV13.4s, vU26.4s, vW22.4s\n" |
| 191 | "str qV13, [%x[vptr0], v_col_stride2]\n" |
| 192 | "fmla vV23.4s, vU47.4s, vW23.4s\n" |
| 193 | "ldr qU77, [uptr6, u_col_stride6]\n" |
| 194 | "fmla vV22.4s, vU45.4s, vW23.4s\n" |
| 195 | "fmla vV23.4s, vU45.4s, vW21.4s\n" |
| 196 | "ldr qU75, [uptr6, u_col_stride4]\n" |
| 197 | "fmla vV23.4s, vU46.4s, vW22.4s\n" |
| 198 | "ldr qU76, [uptr6, u_col_stride5]\n" |
| 199 | "fmul vV33.4s, vU67.4s, vW23.4s\n" |
| 200 | "ldr qU14, [%x[uptr0], u_col_stride3]\n" |
| 201 | "fmul vV32.4s, vU65.4s, vW23.4s\n" |
| 202 | "fmla vV33.4s, vU65.4s, vW21.4s\n" |
| 203 | "ldr qU13, [%x[uptr0], u_col_stride2]\n" |
| 204 | "fmla vV33.4s, vU66.4s, vW22.4s\n" |
| 205 | "ldr qU34, [uptr2, u_col_stride3]\n" |
| 206 | "fmla vV23.4s, vU57.4s, vW33.4s\n" |
| 207 | "fmla vV33.4s, vU57.4s, vW13.4s\n" |
| 208 | "ldr qU33, [uptr2, u_col_stride2]\n" |
| 209 | "fmla vV22.4s, vU55.4s, vW33.4s\n" |
| 210 | "fmla vV23.4s, vU55.4s, vW31.4s\n" |
| 211 | "fmla vV32.4s, vU55.4s, vW13.4s\n" |
| 212 | "fmla vV33.4s, vU55.4s, vW11.4s\n" |
| 213 | "ldr qU24, [uptr1, u_col_stride3]\n" |
| 214 | "fmla vV23.4s, vU56.4s, vW32.4s\n" |
| 215 | "str qV23, [vptr1, v_col_stride2]\n" |
| 216 | "fmla vV33.4s, vU56.4s, vW12.4s\n" |
| 217 | "ldr qU23, [uptr1, u_col_stride2]\n" |
| 218 | "fmla vV33.4s, vU77.4s, vW33.4s\n" |
| 219 | "ldr qU44, [uptr3, u_col_stride3]\n" |
| 220 | "fmla vV32.4s, vU75.4s, vW33.4s\n" |
| 221 | "fmla vV33.4s, vU75.4s, vW31.4s\n" |
| 222 | "ldr qU43, [uptr3, u_col_stride2]\n" |
| 223 | "fmla vV33.4s, vU76.4s, vW32.4s\n" |
| 224 | "str qV33, [vptr2, v_col_stride2]\n" |
| 225 | "ldr qU64, [uptr5, u_col_stride3]\n" |
| 226 | "fmla vV12.4s, vU14.4s, vW12.4s\n" |
| 227 | "ldr qU63, [uptr5, u_col_stride2]\n" |
| 228 | "fmul vV11.4s, vU13.4s, vW13.4s\n" |
| 229 | "fmla vV12.4s, vU13.4s, vW11.4s\n" |
| 230 | "ldr qU54, [uptr4, u_col_stride3]\n" |
| 231 | "fmla vV12.4s, vU34.4s, vW32.4s\n" |
| 232 | "fmla vV22.4s, vU34.4s, vW12.4s\n" |
| 233 | "ldr qU53, [uptr4, u_col_stride2]\n" |
| 234 | "fmla vV11.4s, vU33.4s, vW33.4s\n" |
| 235 | "ldr qU74, [uptr6, u_col_stride3]\n" |
| 236 | "fmla vV12.4s, vU33.4s, vW31.4s\n" |
| 237 | "ldr qU73, [uptr6, u_col_stride2]\n" |
| 238 | "fmul vV21.4s, vU33.4s, vW13.4s\n" |
| 239 | "ldr qU12, [%x[uptr0], u_col_stride1]\n" |
| 240 | "fmla vV22.4s, vU33.4s, vW11.4s\n" |
| 241 | "ldr qU11, [%x[uptr0]], #0x10\n" |
| 242 | "fmla vV12.4s, vU24.4s, vW22.4s\n" |
| 243 | "ldr qU32, [uptr2, u_col_stride1]\n" |
| 244 | "fmla vV11.4s, vU23.4s, vW23.4s\n" |
| 245 | "ldr qU31, [uptr2], #0x10\n" |
| 246 | "fmla vV12.4s, vU23.4s, vW21.4s\n" |
| 247 | "str qV12, [%x[vptr0], v_col_stride1]\n" |
| 248 | "fmla vV22.4s, vU44.4s, vW22.4s\n" |
| 249 | "ldr qU22, [uptr1, u_col_stride1]\n" |
| 250 | "fmla vV21.4s, vU43.4s, vW23.4s\n" |
| 251 | "ldr qU21, [uptr1], #0x10\n" |
| 252 | "fmla vV22.4s, vU43.4s, vW21.4s\n" |
| 253 | "ldr qU42, [uptr3, u_col_stride1]\n" |
| 254 | "fmla vV32.4s, vU64.4s, vW22.4s\n" |
| 255 | "ldr qU41, [uptr3], #0x10\n" |
| 256 | "fmul vV31.4s, vU63.4s, vW23.4s\n" |
| 257 | "ldr qW23, [wptr1, w_col_stride2]\n" |
| 258 | "fmla vV32.4s, vU63.4s, vW21.4s\n" |
| 259 | "ldr qU62, [uptr5, u_col_stride1]\n" |
| 260 | "fmla vV22.4s, vU54.4s, vW32.4s\n" |
| 261 | "ldr qU61, [uptr5], #0x10\n" |
| 262 | "fmla vV32.4s, vU54.4s, vW12.4s\n" |
| 263 | "ldr qU52, [uptr4, u_col_stride1]\n" |
| 264 | "fmla vV21.4s, vU53.4s, vW33.4s\n" |
| 265 | "ldr qU51, [uptr4], #0x10\n" |
| 266 | "fmla vV22.4s, vU53.4s, vW31.4s\n" |
| 267 | "str qV22, [vptr1, v_col_stride1]\n" |
| 268 | "fmla vV31.4s, vU53.4s, vW13.4s\n" |
| 269 | "ldr qW13, [%x[wptr0], w_col_stride2]\n" |
| 270 | "fmla vV32.4s, vU53.4s, vW11.4s\n" |
| 271 | "ldr qU72, [uptr6, u_col_stride1]\n" |
| 272 | "fmla vV32.4s, vU74.4s, vW32.4s\n" |
| 273 | "ldr qU71, [uptr6], #0x10\n" |
| 274 | "fmla vV31.4s, vU73.4s, vW33.4s\n" |
| 275 | "ldr qW33, [wptr2, w_col_stride2]\n" |
| 276 | "fmla vV32.4s, vU73.4s, vW31.4s\n" |
| 277 | "str qV32, [vptr2, v_col_stride1]\n" |
| 278 | "fmla vV11.4s, vU12.4s, vW12.4s\n" |
| 279 | "ldr qU17, [%x[uptr0], u_col_stride6]\n" |
| 280 | "fmla vV11.4s, vU11.4s, vW11.4s\n" |
| 281 | "ldr qU15, [%x[uptr0], u_col_stride4]\n" |
| 282 | "fmla vV11.4s, vU32.4s, vW32.4s\n" |
| 283 | "ldr qU16, [%x[uptr0], u_col_stride5]\n" |
| 284 | "fmla vV21.4s, vU32.4s, vW12.4s\n" |
| 285 | "ldr qU37, [uptr2, u_col_stride6]\n" |
| 286 | "fmla vV11.4s, vU31.4s, vW31.4s\n" |
| 287 | "ldr qU35, [uptr2, u_col_stride4]\n" |
| 288 | "fmla vV21.4s, vU31.4s, vW11.4s\n" |
| 289 | "ldr qU36, [uptr2, u_col_stride5]\n" |
| 290 | "fmla vV11.4s, vU22.4s, vW22.4s\n" |
| 291 | "ldr qU27, [uptr1, u_col_stride6]\n" |
| 292 | "fmla vV11.4s, vU21.4s, vW21.4s\n" |
| 293 | "str qV11, [%x[vptr0]], #0x10\n" |
| 294 | "fmla vV21.4s, vU42.4s, vW22.4s\n" |
| 295 | "ldr qU25, [uptr1, u_col_stride4]\n" |
| 296 | "fmla vV21.4s, vU41.4s, vW21.4s\n" |
| 297 | "fmla vV31.4s, vU62.4s, vW22.4s\n" |
| 298 | "ldr qW22, [wptr1, w_col_stride1]\n" |
| 299 | "fmla vV31.4s, vU61.4s, vW21.4s\n" |
| 300 | "ldr qW21, [wptr1], #0x10\n" |
| 301 | "fmla vV21.4s, vU52.4s, vW32.4s\n" |
| 302 | "fmla vV31.4s, vU52.4s, vW12.4s\n" |
| 303 | "ldr qW12, [%x[wptr0], w_col_stride1]\n" |
| 304 | "fmla vV21.4s, vU51.4s, vW31.4s\n" |
| 305 | "str qV21, [vptr1], #0x10\n" |
| 306 | "fmla vV31.4s, vU51.4s, vW11.4s\n" |
| 307 | "ldr qW11, [%x[wptr0]], #0x10\n" |
| 308 | "fmla vV31.4s, vU72.4s, vW32.4s\n" |
| 309 | "ldr qW32, [wptr2, w_col_stride1]\n" |
| 310 | "fmla vV31.4s, vU71.4s, vW31.4s\n" |
| 311 | "str qV31, [vptr2], #0x10\n" |
| 312 | "fmul vV13.4s, vU17.4s, vW13.4s\n" |
| 313 | "fmul vV12.4s, vU15.4s, vW13.4s\n" |
| 314 | "subs %x[n_iters], %x[n_iters], #1\n" |
| 315 | "fmla vV13.4s, vU15.4s, vW11.4s\n" |
| 316 | "ldr qW31, [wptr2], #0x10\n" |
| 317 | "fmla vV13.4s, vU16.4s, vW12.4s\n" |
| 318 | "ldr qU26, [uptr1, u_col_stride5]\n" |
| 319 | "fmla vV13.4s, vU37.4s, vW33.4s\n" |
| 320 | "ldr qU47, [uptr3, u_col_stride6]\n" |
| 321 | "fmul vV23.4s, vU37.4s, vW13.4s\n" |
| 322 | "ldr qU45, [uptr3, u_col_stride4]\n" |
| 323 | "fmla vV12.4s, vU35.4s, vW33.4s\n" |
| 324 | "ldr qU46, [uptr3, u_col_stride5]\n" |
| 325 | "fmla vV13.4s, vU35.4s, vW31.4s\n" |
| 326 | "ldr qU67, [uptr5, u_col_stride6]\n" |
| 327 | "fmul vV22.4s, vU35.4s, vW13.4s\n" |
| 328 | "bne 1b\n" |
| 329 | |
// Final block: same arithmetic and stores as the loop body, but without the
// weight reloads and without the loads/products that would start a further
// block, so nothing is read beyond the last 4-channel block.
| 330 | "2:" // Tail iteration |
| 331 | "fmla vV23.4s, vU35.4s, vW11.4s\n" |
| 332 | "ldr qU65, [uptr5, u_col_stride4]\n" |
| 333 | "fmla vV13.4s, vU36.4s, vW32.4s\n" |
| 334 | "fmla vV23.4s, vU36.4s, vW12.4s\n" |
| 335 | "ldr qU66, [uptr5, u_col_stride5]\n" |
| 336 | "fmla vV13.4s, vU27.4s, vW23.4s\n" |
| 337 | "ldr qU57, [uptr4, u_col_stride6]\n" |
| 338 | "fmla vV12.4s, vU25.4s, vW23.4s\n" |
| 339 | "ldr qU55, [uptr4, u_col_stride4]\n" |
| 340 | "fmla vV13.4s, vU25.4s, vW21.4s\n" |
| 341 | "ldr qU56, [uptr4, u_col_stride5]\n" |
| 342 | "fmla vV13.4s, vU26.4s, vW22.4s\n" |
| 343 | "str qV13, [%x[vptr0], v_col_stride2]\n" |
| 344 | "fmla vV23.4s, vU47.4s, vW23.4s\n" |
| 345 | "ldr qU77, [uptr6, u_col_stride6]\n" |
| 346 | "fmla vV22.4s, vU45.4s, vW23.4s\n" |
| 347 | "fmla vV23.4s, vU45.4s, vW21.4s\n" |
| 348 | "ldr qU75, [uptr6, u_col_stride4]\n" |
| 349 | "fmla vV23.4s, vU46.4s, vW22.4s\n" |
| 350 | "ldr qU76, [uptr6, u_col_stride5]\n" |
| 351 | "fmul vV33.4s, vU67.4s, vW23.4s\n" |
| 352 | "ldr qU14, [%x[uptr0], u_col_stride3]\n" |
| 353 | "fmul vV32.4s, vU65.4s, vW23.4s\n" |
| 354 | "fmla vV33.4s, vU65.4s, vW21.4s\n" |
| 355 | "ldr qU13, [%x[uptr0], u_col_stride2]\n" |
| 356 | "fmla vV33.4s, vU66.4s, vW22.4s\n" |
| 357 | "ldr qU34, [uptr2, u_col_stride3]\n" |
| 358 | "fmla vV23.4s, vU57.4s, vW33.4s\n" |
| 359 | "fmla vV33.4s, vU57.4s, vW13.4s\n" |
| 360 | "ldr qU33, [uptr2, u_col_stride2]\n" |
| 361 | "fmla vV22.4s, vU55.4s, vW33.4s\n" |
| 362 | "fmla vV23.4s, vU55.4s, vW31.4s\n" |
| 363 | "fmla vV32.4s, vU55.4s, vW13.4s\n" |
| 364 | "fmla vV33.4s, vU55.4s, vW11.4s\n" |
| 365 | "ldr qU24, [uptr1, u_col_stride3]\n" |
| 366 | "fmla vV23.4s, vU56.4s, vW32.4s\n" |
| 367 | "str qV23, [vptr1, v_col_stride2]\n" |
| 368 | "fmla vV33.4s, vU56.4s, vW12.4s\n" |
| 369 | "ldr qU23, [uptr1, u_col_stride2]\n" |
| 370 | "fmla vV33.4s, vU77.4s, vW33.4s\n" |
| 371 | "ldr qU44, [uptr3, u_col_stride3]\n" |
| 372 | "fmla vV32.4s, vU75.4s, vW33.4s\n" |
| 373 | "fmla vV33.4s, vU75.4s, vW31.4s\n" |
| 374 | "ldr qU43, [uptr3, u_col_stride2]\n" |
| 375 | "fmla vV33.4s, vU76.4s, vW32.4s\n" |
| 376 | "str qV33, [vptr2, v_col_stride2]\n" |
| 377 | "ldr qU64, [uptr5, u_col_stride3]\n" |
| 378 | "fmla vV12.4s, vU14.4s, vW12.4s\n" |
| 379 | "ldr qU63, [uptr5, u_col_stride2]\n" |
| 380 | "fmul vV11.4s, vU13.4s, vW13.4s\n" |
| 381 | "fmla vV12.4s, vU13.4s, vW11.4s\n" |
| 382 | "ldr qU54, [uptr4, u_col_stride3]\n" |
| 383 | "fmla vV12.4s, vU34.4s, vW32.4s\n" |
| 384 | "fmla vV22.4s, vU34.4s, vW12.4s\n" |
| 385 | "ldr qU53, [uptr4, u_col_stride2]\n" |
| 386 | "fmla vV11.4s, vU33.4s, vW33.4s\n" |
| 387 | "ldr qU74, [uptr6, u_col_stride3]\n" |
| 388 | "fmla vV12.4s, vU33.4s, vW31.4s\n" |
| 389 | "ldr qU73, [uptr6, u_col_stride2]\n" |
| 390 | "fmul vV21.4s, vU33.4s, vW13.4s\n" |
| 391 | "ldr qU12, [%x[uptr0], u_col_stride1]\n" |
| 392 | "fmla vV22.4s, vU33.4s, vW11.4s\n" |
| 393 | "ldr qU11, [%x[uptr0]], #0x10\n" |
| 394 | "fmla vV12.4s, vU24.4s, vW22.4s\n" |
| 395 | "ldr qU32, [uptr2, u_col_stride1]\n" |
| 396 | "fmla vV11.4s, vU23.4s, vW23.4s\n" |
| 397 | "ldr qU31, [uptr2], #0x10\n" |
| 398 | "fmla vV12.4s, vU23.4s, vW21.4s\n" |
| 399 | "str qV12, [%x[vptr0], v_col_stride1]\n" |
| 400 | "fmla vV22.4s, vU44.4s, vW22.4s\n" |
| 401 | "ldr qU22, [uptr1, u_col_stride1]\n" |
| 402 | "fmla vV21.4s, vU43.4s, vW23.4s\n" |
| 403 | "ldr qU21, [uptr1], #0x10\n" |
| 404 | "fmla vV22.4s, vU43.4s, vW21.4s\n" |
| 405 | "ldr qU42, [uptr3, u_col_stride1]\n" |
| 406 | "fmla vV32.4s, vU64.4s, vW22.4s\n" |
| 407 | "ldr qU41, [uptr3], #0x10\n" |
| 408 | "fmul vV31.4s, vU63.4s, vW23.4s\n" |
| 409 | "fmla vV32.4s, vU63.4s, vW21.4s\n" |
| 410 | "ldr qU62, [uptr5, u_col_stride1]\n" |
| 411 | "fmla vV22.4s, vU54.4s, vW32.4s\n" |
| 412 | "ldr qU61, [uptr5], #0x10\n" |
| 413 | "fmla vV32.4s, vU54.4s, vW12.4s\n" |
| 414 | "ldr qU52, [uptr4, u_col_stride1]\n" |
| 415 | "fmla vV21.4s, vU53.4s, vW33.4s\n" |
| 416 | "ldr qU51, [uptr4], #0x10\n" |
| 417 | "fmla vV22.4s, vU53.4s, vW31.4s\n" |
| 418 | "str qV22, [vptr1, v_col_stride1]\n" |
| 419 | "fmla vV31.4s, vU53.4s, vW13.4s\n" |
| 420 | "fmla vV32.4s, vU53.4s, vW11.4s\n" |
| 421 | "ldr qU72, [uptr6, u_col_stride1]\n" |
| 422 | "fmla vV32.4s, vU74.4s, vW32.4s\n" |
| 423 | "ldr qU71, [uptr6], #0x10\n" |
| 424 | "fmla vV31.4s, vU73.4s, vW33.4s\n" |
| 425 | "fmla vV32.4s, vU73.4s, vW31.4s\n" |
| 426 | "str qV32, [vptr2, v_col_stride1]\n" |
| 427 | "fmla vV11.4s, vU12.4s, vW12.4s\n" |
| 428 | "fmla vV11.4s, vU11.4s, vW11.4s\n" |
| 429 | "fmla vV11.4s, vU32.4s, vW32.4s\n" |
| 430 | "fmla vV21.4s, vU32.4s, vW12.4s\n" |
| 431 | "fmla vV11.4s, vU31.4s, vW31.4s\n" |
| 432 | "fmla vV21.4s, vU31.4s, vW11.4s\n" |
| 433 | "fmla vV11.4s, vU22.4s, vW22.4s\n" |
| 434 | "fmla vV11.4s, vU21.4s, vW21.4s\n" |
| 435 | "str qV11, [%x[vptr0]], #0x10\n" |
| 436 | "fmla vV21.4s, vU42.4s, vW22.4s\n" |
| 437 | "fmla vV21.4s, vU41.4s, vW21.4s\n" |
| 438 | "fmla vV31.4s, vU62.4s, vW22.4s\n" |
| 439 | "fmla vV31.4s, vU61.4s, vW21.4s\n" |
| 440 | "fmla vV21.4s, vU52.4s, vW32.4s\n" |
| 441 | "fmla vV31.4s, vU52.4s, vW12.4s\n" |
| 442 | "fmla vV21.4s, vU51.4s, vW31.4s\n" |
| 443 | "str qV21, [vptr1], #0x10\n" |
| 444 | "fmla vV31.4s, vU51.4s, vW11.4s\n" |
| 445 | "fmla vV31.4s, vU72.4s, vW32.4s\n" |
| 446 | "fmla vV31.4s, vU71.4s, vW31.4s\n" |
| 447 | "str qV31, [vptr2], #0x10\n" |
| 448 | |
| 449 | // Clear aliases |
// .req names persist in the assembler beyond this asm statement, so each one
// defined above is released here.
| 450 | ".unreq uptr1\n" ".unreq uptr2\n" ".unreq uptr3\n" ".unreq uptr4\n" |
| 451 | ".unreq uptr5\n" ".unreq uptr6\n" |
| 452 | ".unreq u_col_stride1\n" ".unreq u_col_stride2\n" ".unreq u_col_stride3\n" |
| 453 | ".unreq u_col_stride4\n" ".unreq u_col_stride5\n" ".unreq u_col_stride6\n" |
| 454 | ".unreq wptr1\n" ".unreq wptr2\n" |
| 455 | ".unreq w_col_stride1\n" ".unreq w_col_stride2\n" |
| 456 | ".unreq vptr1\n" ".unreq vptr2\n" |
| 457 | ".unreq v_col_stride1\n" ".unreq v_col_stride2\n" |
| 458 | ".unreq qU15\n" ".unreq qU73\n" ".unreq qU45\n" ".unreq qU14\n" |
| 459 | ".unreq qW13\n" ".unreq qU62\n" ".unreq qV12\n" |
| 460 | ".unreq qU51\n" ".unreq qU43\n" ".unreq qU55\n" |
| 461 | ".unreq qU77\n" ".unreq qV13\n" ".unreq qV31\n" ".unreq qU44\n" |
| 462 | ".unreq qV33\n" ".unreq qU46\n" ".unreq qU11\n" ".unreq qU37\n" |
| 463 | ".unreq qU56\n" ".unreq qU25\n" ".unreq qU32\n" |
| 464 | ".unreq qU72\n" ".unreq qV22\n" |
| 465 | ".unreq qU67\n" ".unreq qU61\n" ".unreq qU13\n" ".unreq qW33\n" |
| 466 | ".unreq qU74\n" ".unreq qU34\n" ".unreq qU17\n" ".unreq qU66\n" |
| 467 | ".unreq qU33\n" ".unreq qU57\n" ".unreq qU21\n" |
| 468 | ".unreq qW23\n" ".unreq qU42\n" ".unreq qV23\n" ".unreq qU23\n" |
| 469 | ".unreq qU76\n" ".unreq qU47\n" ".unreq qU64\n" ".unreq qU41\n" |
| 470 | ".unreq qU52\n" ".unreq qU54\n" ".unreq qU75\n" ".unreq qU26\n" |
| 471 | ".unreq qU53\n" ".unreq qU27\n" |
| 472 | ".unreq qV21\n" ".unreq qU65\n" |
| 473 | ".unreq qU31\n" ".unreq qU24\n" ".unreq qU36\n" ".unreq qU22\n" |
| 474 | ".unreq qU35\n" ".unreq qU63\n" ".unreq qW12\n" |
| 475 | ".unreq qV32\n" ".unreq qU16\n" ".unreq qW11\n" ".unreq qU12\n" |
| 476 | ".unreq qW31\n" ".unreq qW22\n" ".unreq qU71\n" ".unreq qV11\n" |
| 477 | ".unreq qW21\n" ".unreq qW32\n" ".unreq vW13\n" |
| 478 | ".unreq vU15\n" ".unreq vU73\n" ".unreq vU45\n" ".unreq vU14\n" |
| 479 | ".unreq vU62\n" ".unreq vV12\n" |
| 480 | ".unreq vU51\n" ".unreq vU43\n" ".unreq vU55\n" |
| 481 | ".unreq vU77\n" ".unreq vV13\n" ".unreq vV31\n" ".unreq vU44\n" |
| 482 | ".unreq vV33\n" ".unreq vU46\n" ".unreq vU11\n" ".unreq vU37\n" |
| 483 | ".unreq vU56\n" ".unreq vU25\n" ".unreq vU32\n" |
| 484 | ".unreq vU72\n" ".unreq vV22\n" ".unreq vW21\n" ".unreq vW32\n" |
| 485 | ".unreq vU67\n" ".unreq vU61\n" ".unreq vU13\n" |
| 486 | ".unreq vU74\n" ".unreq vU34\n" ".unreq vU17\n" ".unreq vU66\n" |
| 487 | ".unreq vU33\n" ".unreq vU57\n" ".unreq vU21\n" ".unreq vW23\n" |
| 488 | ".unreq vU42\n" ".unreq vV23\n" ".unreq vU23\n" ".unreq vW33\n" |
| 489 | ".unreq vU76\n" ".unreq vU47\n" ".unreq vU64\n" ".unreq vU41\n" |
| 490 | ".unreq vU52\n" ".unreq vU54\n" ".unreq vU75\n" ".unreq vU26\n" |
| 491 | ".unreq vU53\n" ".unreq vU27\n" ".unreq vV21\n" ".unreq vU65\n" |
| 492 | ".unreq vU31\n" ".unreq vU24\n" ".unreq vU36\n" ".unreq vU22\n" |
| 493 | ".unreq vU35\n" ".unreq vU63\n" ".unreq vW12\n" |
| 494 | ".unreq vV32\n" ".unreq vU16\n" ".unreq vW11\n" ".unreq vU12\n" |
| 495 | ".unreq vW31\n" ".unreq vW22\n" ".unreq vU71\n" ".unreq vV11\n" |
// Read/write operands: the three base pointers and the loop counter are all
// modified by the assembly.
| 496 | : [uptr0] "+r" (uptr0), [wptr0] "+r" (wptr0), [vptr0] "+r" (vptr0), |
| 497 | [n_iters] "+r" (n_iters) |
// Read-only operands: element strides converted to byte strides.
| 498 | : [u_row_stride] "r" (in_row_stride * sizeof(float)), |
| 499 | [u_col_stride] "r" (in_col_stride * sizeof(float)), |
| 500 | [w_row_stride] "r" (weight_row_stride * sizeof(float)), |
| 501 | [w_col_stride] "r" (weight_col_stride * sizeof(float)), |
| 502 | [v_row_stride] "r" (out_row_stride * sizeof(float)), |
| 503 | [v_col_stride] "r" (out_col_stride * sizeof(float)) |
// Clobbers: every SIMD and general-purpose register aliased above, the
// condition flags (subs) and memory (the stores through vptr0..vptr2).
| 504 | : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", |
| 505 | "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", |
| 506 | "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x0", |
| 507 | "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", |
| 508 | "x12", "x13", "x14", "x15", "x16", "cc", "memory" |
| 509 | ); |
| 510 | } |
| 511 | if (channels_remaining) |
| 512 | { |
| 513 | // Fall back on the unoptimised version to clean up the tail |
// uptr0/wptr0/vptr0 have been advanced past the blocks handled above (or are
// still the original pointers when n_channels < 4); all six runtime padding
// arguments are passed as zero.
| 514 | ConvImpl::process_tile<false>( |
| 515 | channels_remaining, |
| 516 | wptr0, weight_row_stride, weight_col_stride, |
| 517 | uptr0, in_row_stride, in_col_stride, |
| 518 | vptr0, out_row_stride, out_col_stride, |
| 519 | 0, 0, 0, 0, 0, 0 |
| 520 | ); |
| 521 | } |
| 522 | } |
| 523 | |
| 524 | #endif // __aarch64__ |
| 525 | |
| 526 | template <> |
| 527 | const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>; |
| 528 | |
| 529 | template <> |
| 530 | const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = { |
| 531 | ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>, |
| 532 | ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>, |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 533 | }; |
| 534 | |
Georgios Pinitas | be0ae93 | 2018-03-13 13:08:12 +0000 | [diff] [blame] | 535 | template <> |
| 536 | const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = { |
| 537 | ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>, |
| 538 | ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>, |
| 539 | }; |
| 540 | |
| 541 | template <> |
| 542 | const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = { |
| 543 | { |
| 544 | ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>, |
| 545 | ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>, |
| 546 | ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>, |
| 547 | }, |
| 548 | { |
| 549 | ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>, |
| 550 | ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>, |
| 551 | ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>, |
| 552 | }, |
| 553 | { |
| 554 | ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>, |
| 555 | ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>, |
| 556 | ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>, |
| 557 | }, |
| 558 | { |
| 559 | ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>, |
| 560 | ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>, |
| 561 | ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>, |
| 562 | }, |
| 563 | { |
| 564 | ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>, |
| 565 | ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>, |
| 566 | ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>, |
| 567 | }, |
| 568 | { |
| 569 | ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>, |
| 570 | ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>, |
| 571 | ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>, |
| 572 | }, |
| 573 | { |
| 574 | ConvImpl::template process_tile<true, 0, 0, 6, 0, 0, 0>, |
| 575 | ConvImpl::template process_tile<true, 0, 0, 6, 0, 1, 0>, |
| 576 | ConvImpl::template process_tile<true, 0, 0, 6, 0, 2, 0>, |
| 577 | }, |
| 578 | }; |
| 579 | |
| 580 | template <> |
| 581 | const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = { |
| 582 | { |
| 583 | ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>, |
| 584 | ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>, |
| 585 | ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>, |
| 586 | }, |
| 587 | { |
| 588 | ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>, |
| 589 | ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>, |
| 590 | ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>, |
| 591 | }, |
| 592 | { |
| 593 | ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>, |
| 594 | ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>, |
| 595 | ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>, |
| 596 | }, |
| 597 | { |
| 598 | ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>, |
| 599 | ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>, |
| 600 | ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>, |
| 601 | }, |
| 602 | { |
| 603 | ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>, |
| 604 | ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>, |
| 605 | ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>, |
| 606 | }, |
| 607 | { |
| 608 | ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>, |
| 609 | ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>, |
| 610 | ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>, |
| 611 | }, |
| 612 | { |
| 613 | ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 0>, |
| 614 | ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 1>, |
| 615 | ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 2>, |
| 616 | }, |
| 617 | }; |
| 618 | |
| 619 | template <> |
| 620 | const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>; |
Georgios Pinitas | 4074c99 | 2018-01-30 18:13:46 +0000 | [diff] [blame] | 621 | |
// Explicit instantiation of the class aliased as Conv above, so that its
// members are emitted in this translation unit.
| 622 | template class DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float>; |
| 623 | } // namespace depthwise |