/*
 * Copyright (c) 2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "asmlib.hpp"
#include "convolution_parameters.hpp"
#include "convolver.hpp"
#include "interleave_indirect.hpp"
#include "bfloat.hpp"

#include <alloca.h>

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <tuple>
#include <type_traits>
#include <vector>

#include <arm_neon.h>

#include "utils.hpp"

namespace arm_gemm {

/*
 * Core function that does the heavy lifting - interleave 'int_by' rows of width 'width' together.
 *
 * 'height' indicates the actual number of rows to interleave, so if it's less than int_by then the remaining
 * entries are padded (note that this is "GEMM" padding rather than convolution padding, so there is no need to pad
 * with a particular value).
 *
 * Note that it is not expected for this templated version to ever be used - all cases that matter should be
 * explicitly specialized with an optimized implementation.
 */
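/*
 * Illustrative example (not part of the original documentation): with block=1,
 * int_by=4, width=3, height=2 and input rows A and B, the generic loop below
 * emits
 *   A0 B0 0 0   A1 B1 0 0   A2 B2 0 0
 * where the zeros are GEMM padding for the two missing rows.  When
 * integrate_sums is enabled, a further 'int_by' int32 row sums follow the
 * interleaved data.
 */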
template<unsigned int height_vectors, unsigned int block, VLType vlt, bool integrate_sums, typename TIn, typename TOut>
void interleave_block( TOut * &out, const TIn * const *in, size_t width, size_t height, size_t row_offset, bool first) {
    const unsigned int int_by = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);

    std::vector<int32_t> the_sums;

    if (integrate_sums) {
        the_sums = std::vector<int32_t>(int_by, 0);

        if (!first) {
            // In 'integrate sums' mode, we dump the sums at the end of each pass.

            // On the last pass this is correct, but on other passes it is not -
            // so on the subsequent pass we need to take the output written by
            // the previous pass as the starting point for the sums, and then
            // overwrite them with new interleaved data.
            int32_t *out_int32 = reinterpret_cast<int32_t *>(out);

            // Rewind pointer to where we wrote out the sums last time.
            out_int32 -= int_by;

            // Restore the running sums.
            memcpy(the_sums.data(), out_int32, int_by * sizeof(int32_t));

            // Update the "real" pointer so that the next output will clobber the old sums.
            out = reinterpret_cast<TOut *>(out_int32);
        }
    }

    for (unsigned int pos=0; pos<width; pos+=block) {
        for (unsigned int row=0; row<int_by; row++) {
            // Row out of range - pad 'block' entries.
            if (row >= height) {
                for (unsigned int col=0; col<block; col++) {
                    *out++ = 0;
                }
                continue;
            }

            for (unsigned int col=0; col<block; col++) {
                // Column out of range - pad a single entry
                if (pos + col >= width) {
                    *out++ = 0;
                    continue;
                }

                if (integrate_sums) {
                    the_sums[row] += in[row][row_offset + pos + col];
                }

                *out++ = in[row][row_offset + pos + col];
            }
        }
    }

    if (integrate_sums) {
        int32_t *out_int32 = reinterpret_cast<int32_t *>(out);

        memcpy(out_int32, the_sums.data(), int_by * sizeof(int32_t));

        out = reinterpret_cast<TOut *>(out_int32 + int_by);
    }
}

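/*
 * Post-process the int32 row-sum block: if the row-sum multiplier is non-zero,
 * interleave_block() has already written the sums just behind 'out', so scale
 * them in place; if it is zero, the sums were never written, so emit a block of
 * zero sums at 'out' and advance 'out' past it.
 */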
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TOut>
inline void FixupRowSums(TOut * &out, const int32_t row_sum_multiplier) {
    const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);

    // If we are integrating row sums, we need to do some fix up, depending on whether the multiplier is non-zero or not.
    if (row_sum_multiplier) {
        // Non-zero: interleave_block<>() will have done the sums, so 'out' will point to the start of the
        // next block (post sums).
        // We need to go back and apply the multiplier to the computed sums. We don't need to change 'out'.
        int32_t *out_int32 = reinterpret_cast<int32_t *>(out);

        out_int32 -= height;
        for (unsigned int i=0; i<height; i++) {
            out_int32[i] *= row_sum_multiplier;
        }
    } else {
        // Zero: interleave_block<>() will *not* have done the sums, so 'out' will point to the start of the
        // sum block. We need to insert the (zero) sums, and advance 'out'.
        int32_t *out_int32 = reinterpret_cast<int32_t *>(out);

        for (unsigned int i=0; i<height; i++) {
            out_int32[i] = 0;
        }

        out_int32 += height;

        out = reinterpret_cast<TOut *>(out_int32);
    }
}

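/*
 * Indirect interleave: 'ptr' is indexed as ptr[string][row] (this is the
 * indexing used in the loop below), giving a pointer to that row's data for
 * that string.  The K range [k0, kmax) is addressed in units of
 * 'rounded_stringlen' per string, while at most 'stringlen' elements are read
 * from each string.
 */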
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
void IndirectInterleave(TOut *out, const TIn * const * const *ptr, unsigned int stringlen,
                        unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax,
                        const unsigned int k0, const unsigned int kmax, bool integrate_sums,
                        const int32_t row_sum_multiplier) {
    const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);

    // 'interleave_block' implementations are entitled to read a pointer for each row they handle from the input
    // pointer array, even for out of range rows (although they must not subsequently dereference those pointers for
    // out of range rows). This allows interleave_block to use techniques like row predication, or loading all
    // pointers and conditionally overriding the out of range ones.

    // This is problematic in the "pure" indirect case when we get to the last rows, where it can lead to out of
    // range reads. Avoid this with a local buffer to use in last-rows cases. Use alloca as a std::vector can be
    // expensive in highly threaded scenarios.
    const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));

    // Figure out the starting position based on k0 (with rounded length)
    unsigned int start_string = k0 / rounded_stringlen;
    unsigned int start_stringpos = k0 % rounded_stringlen;
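    // For example (illustrative values only): with rounded_stringlen=8 and
    // k0=20, interleaving starts in string 2, at position 4 within that string.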

    // Process blocks of 'height' height...
    for (unsigned int ybase = y0; ybase < ymax; ybase+=height) {
        // Height to process
        unsigned int active_height = std::min(ymax - ybase, height);

        // Track our progress through the various strings
        unsigned int k_left = (kmax - k0);
        unsigned int string = start_string;
        unsigned int stringpos = start_stringpos;

        bool first = true;

        // Prepare to call 'interleave_block' above for each string encompassed by K range
        while (k_left > 0) {
            // Width to process - and the width we will generate (with padding)
            unsigned int in_width = std::min(k_left, stringlen - stringpos);
            unsigned int out_width = std::min(k_left, rounded_stringlen - stringpos);

            const TIn * const *row_base = ptr[string] + ybase;

            // If not all rows are valid, copy the ones that are into local array (see above comment).
            if (active_height < height) {
                for (unsigned int i=0; i<active_height; i++) {
                    row_ptrs[i] = ptr[string][ybase + i];
                }

                row_base = row_ptrs;
            }

            // 'integrate_sums' is a function parameter rather than a template parameter to prevent duplicating too
            // much code. However, integrated sums make no sense for non-integral types and won't ever be
            // requested. So put a type trait check here to avoid generating pointless code.
            if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
                interleave_block<height_vectors, block, vlt, true>(out, row_base, in_width, active_height, stringpos, first);
            } else {
                interleave_block<height_vectors, block, vlt, false>(out, row_base, in_width, active_height, stringpos, first);
            }

            k_left -= out_width;
            string++;
            stringpos=0;
            first=false;
        }

        if (std::is_integral<TOut>::value && integrate_sums) {
            FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
        }
    }
}

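/*
 * Convolution variant: the row pointers for each block are produced on the fly
 * by the supplied convolver - each next_block() call below fills 'row_ptrs'
 * and returns the width and row offset to interleave for that block.
 */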
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
void ConvolutionInterleave(TOut *out, const TIn *in, size_t in_stride, const convolver<TIn> &conv, const unsigned int rounded_stringlen,
                           const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) {
    const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);

    auto conv_cols = conv.process_columns(in, in_stride, k0, kmax, rounded_stringlen);

    // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
    const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));

    for (unsigned int ybase = y0; ybase < ymax; ybase += height) {
        // How many of the rows are active - the rest will get padded in interleave_block.
        unsigned int active_height = std::min(ymax - ybase, height);
        bool first = true;

        auto conv_rows = conv_cols.process_rows(ybase, active_height);

        while (!conv_rows.finished()) {
            unsigned int width, offset;

            // Get next set of parameters
            std::tie(width, offset) = conv_rows.next_block(row_ptrs);

            // Perform the interleave
            if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
                interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, active_height, offset, first);
            } else {
                interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, active_height, offset, first);
            }

            first=false;
        }

        if (std::is_integral<TOut>::value && integrate_sums) {
            FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
        }
    }
}

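/*
 * Dense (non-indirect) variant: row 'r' of each block of rows starts at
 * in + (y + r) * in_stride, and the whole K range [k0, kmax) is handled by a
 * single interleave_block() call per block.
 */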
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
void Interleave(TOut *out, const TIn *in, size_t in_stride, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) {
    const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);

    // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
    const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));

    const unsigned int width=kmax-k0;

    for (unsigned int y=y0; y<ymax; y+=height) {
        for (unsigned int r=0; r<height; r++) {
            row_ptrs[r] = in + ((y + r) * in_stride);
        }

        if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
            interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, std::min(height, ymax-y), k0, true);
        } else {
            interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, std::min(height, ymax-y), k0, true);
        }

        if (std::is_integral<TOut>::value && integrate_sums) {
            FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
        }
    }
}

#include "indirect-interleaves/list.hpp"

/**** Instantiate needed implementations ****/

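/*
 * Hypothetical usage sketch (illustrative only; M, K, lda, a_matrix and
 * packed_out are placeholder names, not part of this file): the float/NEON
 * height-8 variant instantiated below could be driven as
 *
 *   Interleave<8, 1, VLType::None>(packed_out, a_matrix, lda, 0, M, 0, K, false, 0);
 *
 * which packs rows [0, M) and columns [0, K) of 'a_matrix' (row stride 'lda'
 * elements) into 'packed_out' in blocks of 8 rows, zero-padding the final
 * block if M is not a multiple of 8.
 */
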
/* AArch32 */
#ifdef __arm__
/* FP32 */
/* NEON implementation (height 6) */
template void IndirectInterleave<6, 1, VLType::None>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<6, 1, VLType::None>(float *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<6, 1, VLType::None>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* FP16 */
#if __ARM_FP16_ARGS
/* NEON implementation using FP32 kernel (height 6) */
template void IndirectInterleave<6, 1, VLType::None>(float *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<6, 1, VLType::None>(float *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<6, 1, VLType::None>(float *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
#endif /* __ARM_FP16_ARGS */

/* BF16 */
/* NEON implementation using FP32 kernel */
template void IndirectInterleave<6, 1, VLType::None>(float *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<6, 1, VLType::None>(float *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<6, 1, VLType::None>(float *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
#endif

/* AArch64 */
#ifdef __aarch64__
/* FP64 */
/* NEON/SVE implementation (height 8) */
template void IndirectInterleave<8, 1, VLType::None>(double *, const double * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(double *, const double *, size_t, const convolver<double> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(double *, const double *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* FP32 */
/* NEON/SVE implementation (height 8) */
template void IndirectInterleave<8, 1, VLType::None>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(float *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* FMMLA */
template void IndirectInterleave<8, 2, VLType::None>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 2, VLType::None>(float *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 2, VLType::None>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* FP16 */
template void IndirectInterleave<8, 1, VLType::None>(__fp16 *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(__fp16 *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(__fp16 *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

template void IndirectInterleave<8, 1, VLType::None>(float *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(float *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(float *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* BF16 */
/* NEON/SVE BFDOT */
template void IndirectInterleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

template void IndirectInterleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* NEON/SVE using FP32 kernel */
template void IndirectInterleave<8, 1, VLType::None>(float *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(float *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(float *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* INT16 */
template void IndirectInterleave<8, 1, VLType::None>(int16_t *, const int16_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(int16_t *, const int16_t *, size_t, const convolver<int16_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(int16_t *, const int16_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

template void IndirectInterleave<8, 1, VLType::None>(uint16_t *, const uint16_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(uint16_t *, const uint16_t *, size_t, const convolver<uint16_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(uint16_t *, const uint16_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* INT8 */
/* NEON SMLA/SMLAL (height 4, block 16) */
template void IndirectInterleave<4, 16, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<4, 16, VLType::None>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<4, 16, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* NEON SDOT (height 8, block 4) */
template void IndirectInterleave<8, 4, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 4, VLType::None>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 4, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* MMLA SMMLA (height 8, block 8) */
template void IndirectInterleave<8, 8, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 8, VLType::None>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 8, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* NEON SDOT (height 8, block 1) */
template void IndirectInterleave<8, 1, VLType::None>(int16_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(int16_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(int16_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* NEON SMLA/SMLAL (height 4, block 16) */
template void IndirectInterleave<4, 16, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<4, 16, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<4, 16, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* NEON SDOT (height 8, block 4) */
template void IndirectInterleave<8, 4, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 4, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 4, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* MMLA SMMLA (height 8, block 8) */
template void IndirectInterleave<8, 8, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 8, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 8, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* NEON 16-bit (height 8, block 1) */
template void IndirectInterleave<8, 1, VLType::None>(uint16_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(uint16_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(uint16_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
#endif // __aarch64__

} // namespace arm_gemm