blob: 59591935cd36d900c5af33df0648ca79fff56fd4 [file] [log] [blame]
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00001/*
Pablo Tello4e66d702022-03-07 18:20:12 +00002 * Copyright (c) 2020-2022 Arm Limited.
Georgios Pinitasc0b6f762020-11-02 01:37:17 +00003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include "asmlib.hpp"
26#include "convolution_parameters.hpp"
27#include "convolver.hpp"
28#include "interleave_indirect.hpp"
29#include "bfloat.hpp"
30
Pablo Tello4e66d702022-03-07 18:20:12 +000031#if !defined(_WIN64) && !defined(__OpenBSD__)
Georgios Pinitasc0b6f762020-11-02 01:37:17 +000032#include <alloca.h>
Pablo Tello4e66d702022-03-07 18:20:12 +000033#endif /* !defined(_WIN64) && !defined(__OpenBSD__) */
Georgios Pinitasc0b6f762020-11-02 01:37:17 +000034
35#include <algorithm>
36#include <cstddef>
37#include <cstdint>
38#include <cstdio>
39#include <cstring>
40#include <tuple>
41#include <type_traits>
42#include <vector>
43
44#include <arm_neon.h>
45
46#include "utils.hpp"
47
48namespace arm_gemm {
49
50/*
51 * Core function that does heavy lifting - interleave 'int_by' rows of width 'width' together.
52 *
53 * 'height' indicates the actual number of rows to interleave, so if it's less than int_by then the remaining
54 * entries are padded (note that this is "GEMM" padding rather than convolution padding, so there is no need to pad
55 * with a particular value.
56 *
57 * Note that it is not expected for this templated version to ever be used - all cases that matter should be
58 * explicitly specialized with an optimized implementation.
59 */
60template<unsigned int height_vectors, unsigned int block, VLType vlt, bool integrate_sums, typename TIn, typename TOut>
61void interleave_block( TOut * &out, const TIn * const *in, size_t width, size_t height, size_t row_offset, bool first) {
62 const unsigned int int_by = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
63
64 std::vector<int32_t> the_sums;
65
66 if (integrate_sums) {
67 the_sums = std::vector<int32_t>(int_by, 0);
68
69 if (!first) {
70 // In 'integrate sums' mode, we dump the sums at the end on each pass.
71
72 // On the last pass this is correct, but on other passes it is not -
73 // so on the subsequent pass we need to take the output written by
74 // the previous pass as starting point for the sums, and then
75 // overwrite them with new interleaved data.
76 int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
77
78 // Rewind pointer to where we wrote out the sums last time.
79 out_int32 -= int_by;
80
81 // Restore the running sums.
82 memcpy(the_sums.data(), out_int32, int_by * sizeof(int32_t));
83
84 // Update the "real" pointer so that the next output will clobber the old sums.
85 out = reinterpret_cast<TOut *>(out_int32);
86 }
87 }
88
89 for (unsigned int pos=0; pos<width; pos+=block) {
90 for (unsigned int row=0; row<int_by; row++) {
91 // Row out of range - pad 'block' entries.
92 if (row >= height) {
93 for (unsigned int col=0; col<block; col++) {
94 *out++ = 0;
95 }
96 continue;
97 }
98
99 for (unsigned int col=0; col<block; col++) {
100 // Column out of range - pad a single entry
101 if (pos + col >= width) {
102 *out++ = 0;
103 continue;
104 }
105
106 if (integrate_sums) {
107 the_sums[row] += in[row][row_offset + pos + col];
108 }
109
110 *out++ = in[row][row_offset + pos + col];
111 }
112 }
113 }
114
115 if (integrate_sums) {
116 int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
117
118 memcpy(out_int32, the_sums.data(), int_by * sizeof(int32_t));
119
120 out = reinterpret_cast<TOut *>(out_int32 + int_by);
121 }
122}
123
124template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TOut>
125inline void FixupRowSums(TOut * &out, const int32_t row_sum_multiplier) {
126 const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
127
128 // If we are integrating row sums, we need to do some fix up, depending on whether the multiplier is non-zero or not.
129 if (row_sum_multiplier) {
130 // Non-zero: interleave_block<>() will have done the sums, so 'out' will point to the start of the
131 // next block (post sums).
132 // We need to go back and apply the multiplier to the computed sums. We don't need to change 'out'.
133 int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
134
135 out_int32 -= height;
136 for (unsigned int i=0; i<height; i++) {
137 out_int32[i] *= row_sum_multiplier;
138 }
139 } else {
140 // Zero: interleave_block<>() will *not* have done the sums, so 'out' will point to the start of the
141 // sum block. We need to insert the (zero) sums, and advance 'out'.
142 int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
143
144 for (unsigned int i=0; i<height; i++) {
145 out_int32[i] = 0;
146 }
147
148 out_int32 += height;
149
150 out = reinterpret_cast<TOut *>(out_int32);
151 }
152}
153
154template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
155void IndirectInterleave(TOut *out, const TIn * const * const *ptr, unsigned int stringlen,
156 unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax,
157 const unsigned int k0, const unsigned int kmax, bool integrate_sums,
158 const int32_t row_sum_multiplier) {
159 const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
160
161 // 'interleave_block' implementations are entitled to read a pointer for each row they handle from the input
162 // pointer array, even for out of range rows (although they must not subsequently dereference those pointers for
163 // out of range rows). This allows interleave_block to use techniques like row predication, or loading all
164 // pointers and conditionally overriding the out of range ones.
165
166 // This is problematic in the "pure" indirect case when we get to the last rows, where it can lead to out of
167 // range reads. Avoid this with a local buffer to use in last-rows cases. Use alloca as a std::vector can be
168 // expensive in highly threaded scenarios.
169 const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
170
171 // Figure out the starting position based on k0 (with rounded length)
172 unsigned int start_string = k0 / rounded_stringlen;
173 unsigned int start_stringpos = k0 % rounded_stringlen;
174
175 // Process blocks of 'height' height...
176 for (unsigned int ybase = y0; ybase < ymax; ybase+=height) {
177 // Height to process
178 unsigned int active_height = std::min(ymax - ybase, height);
179
180 // Track our progress through the various strings
181 unsigned int k_left = (kmax - k0);
182 unsigned int string = start_string;
183 unsigned int stringpos = start_stringpos;
184
185 bool first = true;
186
187 // Prepare to call 'interleave_block' above for each string encompassed by K range
188 while (k_left > 0) {
189 // Width to process - and the width we will generate (with padding)
190 unsigned int in_width = std::min(k_left, stringlen - stringpos);
191 unsigned int out_width = std::min(k_left, rounded_stringlen - stringpos);
192
193 const TIn * const *row_base = ptr[string] + ybase;
194
195 // If not all rows are valid, copy the ones that are into local array (see above comment).
196 if (active_height < height) {
197 for (unsigned int i=0; i<active_height; i++) {
198 row_ptrs[i] = ptr[string][ybase + i];
199 }
200
201 row_base = row_ptrs;
202 }
203
204 // 'integrate_sums' is a function parameter rather than a template parameter to prevent duplicating too
205 // much code. However, integrated sums make no sense for non-integral types and won't ever be
206 // requested. So put a type trait check here to avoid generating pointless code.
207 if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
208 interleave_block<height_vectors, block, vlt, true>(out, row_base, in_width, active_height, stringpos, first);
209 } else {
210 interleave_block<height_vectors, block, vlt, false>(out, row_base, in_width, active_height, stringpos, first);
211 }
212
213 k_left -= out_width;
214 string++;
215 stringpos=0;
216 first=false;
217 }
218
219 if (std::is_integral<TOut>::value && integrate_sums) {
220 FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
221 }
222 }
223}
224
225template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
226void ConvolutionInterleave(TOut *out, const TIn *in, size_t in_stride, const convolver<TIn> &conv, const unsigned int rounded_stringlen,
227 const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) {
228 const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
229
230 auto conv_cols = conv.process_columns(in, in_stride, k0, kmax, rounded_stringlen);
231
232 // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
233 const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
234
235 for (unsigned int ybase = y0; ybase < ymax; ybase += height) {
236 // How many of the rows are active - the rest will get padded in interleave_block.
237 unsigned int active_height = std::min(ymax - ybase, height);
238 bool first = true;
239
240 auto conv_rows = conv_cols.process_rows(ybase, active_height);
241
242 while (!conv_rows.finished()) {
243 unsigned int width, offset;
244
245 // Get next set of parameters
246 std::tie(width, offset) = conv_rows.next_block(row_ptrs);
247
248 // Perform the interleave
249 if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
250 interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, active_height, offset, first);
251 } else {
252 interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, active_height, offset, first);
253 }
254
255 first=false;
256 }
257
258 if (std::is_integral<TOut>::value && integrate_sums) {
259 FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
260 }
261 }
262}
263
264template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
265void Interleave(TOut *out, const TIn *in, size_t in_stride, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) {
266 const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
267
268 // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
269 const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
270
271 const unsigned int width=kmax-k0;
272
273 for (unsigned int y=y0; y<ymax; y+=height) {
274 for (unsigned int r=0; r<height; r++) {
275 row_ptrs[r] = in + ((y + r) * in_stride);
276 }
277
278 if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
279 interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, std::min(height, ymax-y), k0, true);
280 } else {
281 interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, std::min(height, ymax-y), k0, true);
282 }
283
284 if (std::is_integral<TOut>::value && integrate_sums) {
285 FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
286 }
287 }
288}
289
290#include "indirect-interleaves/list.hpp"
291
292/**** Instantiate needed implementations ****/
293
294/* AArch32 */
295#ifdef __arm__
296/* FP32 */
Michele Di Giorgio33f41fa2021-03-09 14:09:08 +0000297/* Arm® Neon™ implementation (height 6) */
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000298template void IndirectInterleave<6, 1, VLType::None>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
299template void ConvolutionInterleave<6, 1, VLType::None>(float *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
300template void Interleave<6, 1, VLType::None>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
301
302/* FP16 */
303#if __ARM_FP16_ARGS
Michele Di Giorgio33f41fa2021-03-09 14:09:08 +0000304/* Arm® Neon™ implementation using FP32 kernel (height 6) */
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000305template void IndirectInterleave<6, 1, VLType::None>(float *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
306template void ConvolutionInterleave<6, 1, VLType::None>(float *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
307template void Interleave<6, 1, VLType::None>(float *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
308#endif /* __ARM_FP16_ARGS */
309
310/* BF16 */
Michele Di Giorgio33f41fa2021-03-09 14:09:08 +0000311/* Arm® Neon™ implementation using FP32 kernel */
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000312template void IndirectInterleave<6, 1, VLType::None>(float *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
313template void ConvolutionInterleave<6, 1, VLType::None>(float *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
314template void Interleave<6, 1, VLType::None>(float *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
315#endif
316
317/* AArch64 */
318#ifdef __aarch64__
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000319/* FP32 */
Michele Di Giorgio33f41fa2021-03-09 14:09:08 +0000320/* Arm® Neon™/SVE implementation (height 8) */
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000321template void IndirectInterleave<8, 1, VLType::None>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
322template void ConvolutionInterleave<8, 1, VLType::None>(float *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
323template void Interleave<8, 1, VLType::None>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
324
Michalis Spyrou20fca522021-06-07 14:23:57 +0100325#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVEF32MM)
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000326/* FMMLA */
327template void IndirectInterleave<8, 2, VLType::None>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
328template void ConvolutionInterleave<8, 2, VLType::None>(float *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
329template void Interleave<8, 2, VLType::None>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
Michalis Spyrou20fca522021-06-07 14:23:57 +0100330#endif // ARM_COMPUTE_ENABLE_SVE && ARM_COMPUTE_ENABLE_SVEF32MM
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000331
332/* FP16 */
Georgios Pinitas33e03072021-01-14 13:43:40 +0000333#if defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000334template void IndirectInterleave<8, 1, VLType::None>(__fp16 *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
335template void ConvolutionInterleave<8, 1, VLType::None>(__fp16 *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
336template void Interleave<8, 1, VLType::None>(__fp16 *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
Georgios Pinitas33e03072021-01-14 13:43:40 +0000337#endif // FP16_KERNELS or __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000338
339template void IndirectInterleave<8, 1, VLType::None>(float *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
340template void ConvolutionInterleave<8, 1, VLType::None>(float *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
341template void Interleave<8, 1, VLType::None>(float *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
342
343/* BF16 */
Michele Di Giorgio33f41fa2021-03-09 14:09:08 +0000344/* Arm® Neon™/SVE BFDOT */
Michalis Spyrou20fca522021-06-07 14:23:57 +0100345#ifdef ARM_COMPUTE_ENABLE_BF16
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000346template void IndirectInterleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
347template void ConvolutionInterleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
348template void Interleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
349
350template void IndirectInterleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
351template void ConvolutionInterleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
352template void Interleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
Georgios Pinitas4ee8b152021-07-16 16:16:43 +0100353
354template void IndirectInterleave<8, 4, VLType::None>(bfloat16 *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
355template void ConvolutionInterleave<8, 4, VLType::None>(bfloat16 *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
356template void Interleave<8, 4, VLType::None>(bfloat16 *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
Michalis Spyrou20fca522021-06-07 14:23:57 +0100357#endif // ARM_COMPUTE_ENABLE_BF16
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000358
Michele Di Giorgio33f41fa2021-03-09 14:09:08 +0000359/* Arm® Neon™/SVE using FP32 kernel */
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000360template void IndirectInterleave<8, 1, VLType::None>(float *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
361template void ConvolutionInterleave<8, 1, VLType::None>(float *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
362template void Interleave<8, 1, VLType::None>(float *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
363
364/* INT16 */
365template void IndirectInterleave<8, 1, VLType::None>(int16_t *, const int16_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
366template void ConvolutionInterleave<8, 1, VLType::None>(int16_t *, const int16_t *, size_t, const convolver<int16_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
367template void Interleave<8, 1, VLType::None>(int16_t *, const int16_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
368
369template void IndirectInterleave<8, 1, VLType::None>(uint16_t *, const uint16_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
370template void ConvolutionInterleave<8, 1, VLType::None>(uint16_t *, const uint16_t *, size_t, const convolver<uint16_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
371template void Interleave<8, 1, VLType::None>(uint16_t *, const uint16_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
372
373/* INT8 */
Michele Di Giorgio33f41fa2021-03-09 14:09:08 +0000374/* Arm® Neon™ SMLA/SMLAL (height 4, block 16) */
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000375template void IndirectInterleave<4, 16, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
376template void ConvolutionInterleave<4, 16, VLType::None>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
377template void Interleave<4, 16, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
378
Michele Di Giorgio33f41fa2021-03-09 14:09:08 +0000379/* Arm® Neon™ SDOT (height 8, block 4) */
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000380template void IndirectInterleave<8, 4, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
381template void ConvolutionInterleave<8, 4, VLType::None>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
382template void Interleave<8, 4, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
383
384/* MMLA SMMLA (height 8, block 8) */
385template void IndirectInterleave<8, 8, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
386template void ConvolutionInterleave<8, 8, VLType::None>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
387template void Interleave<8, 8, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
388
Michele Di Giorgio33f41fa2021-03-09 14:09:08 +0000389/* Arm® Neon™ 16-bit (height 8, block 1) */
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000390template void IndirectInterleave<8, 1, VLType::None>(int16_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
391template void ConvolutionInterleave<8, 1, VLType::None>(int16_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
392template void Interleave<8, 1, VLType::None>(int16_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
393
Michele Di Giorgio33f41fa2021-03-09 14:09:08 +0000394/* Arm® Neon™ SMLA/SMLAL (height 4, block 16) */
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000395template void IndirectInterleave<4, 16, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
396template void ConvolutionInterleave<4, 16, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
397template void Interleave<4, 16, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
398
Michele Di Giorgio33f41fa2021-03-09 14:09:08 +0000399/* Arm® Neon™ UDOT (height 8, block 4) */
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000400template void IndirectInterleave<8, 4, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
401template void ConvolutionInterleave<8, 4, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
402template void Interleave<8, 4, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
403
404/* MMLA UMMLA (height 8, block 8) */
405template void IndirectInterleave<8, 8, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
406template void ConvolutionInterleave<8, 8, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
407template void Interleave<8, 8, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000408
Michele Di Giorgio33f41fa2021-03-09 14:09:08 +0000409/* Arm® Neon™ 16-bit (height 8, block 1) */
Georgios Pinitasc0b6f762020-11-02 01:37:17 +0000410template void IndirectInterleave<8, 1, VLType::None>(uint16_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
411template void ConvolutionInterleave<8, 1, VLType::None>(uint16_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
412template void Interleave<8, 1, VLType::None>(uint16_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
413#endif // __aarch64__
414
415} // namespace arm_gemm