/*
 * Copyright (c) 2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "asmlib.hpp"
#include "convolution_parameters.hpp"
#include "convolver.hpp"
#include "interleave_indirect.hpp"
#include "bfloat.hpp"

#include <alloca.h>

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <tuple>
#include <type_traits>
#include <vector>

#include <arm_neon.h>

#include "utils.hpp"

namespace arm_gemm {

/*
 * Core function that does the heavy lifting - interleave 'int_by' rows of width 'width' together.
 *
 * 'height' indicates the actual number of rows to interleave, so if it's less than int_by then the remaining
 * entries are padded (note that this is "GEMM" padding rather than convolution padding, so there is no need to pad
 * with a particular value).
 *
 * Note that it is not expected for this templated version to ever be used - all cases that matter should be
 * explicitly specialized with an optimized implementation.
 */
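/*
 * Illustrative example (not part of the original documentation): with block=1,
 * int_by=4, width=3, height=2 and input rows A and B, the generic loop below
 * emits
 *   A0 B0 0 0   A1 B1 0 0   A2 B2 0 0
 * where the zeros are GEMM padding for the two missing rows.  When
 * integrate_sums is enabled, a further 'int_by' int32 row sums follow the
 * interleaved data.
 */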
template<unsigned int height_vectors, unsigned int block, VLType vlt, bool integrate_sums, typename TIn, typename TOut>
void interleave_block( TOut * &out, const TIn * const *in, size_t width, size_t height, size_t row_offset, bool first) {
    const unsigned int int_by = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);

    std::vector<int32_t> the_sums;

    if (integrate_sums) {
        the_sums = std::vector<int32_t>(int_by, 0);

        if (!first) {
            // In 'integrate sums' mode, we dump the sums at the end of each pass.

            // On the last pass this is correct, but on other passes it is not -
            // so on the subsequent pass we need to take the output written by
            // the previous pass as the starting point for the sums, and then
            // overwrite them with new interleaved data.
            int32_t *out_int32 = reinterpret_cast<int32_t *>(out);

            // Rewind pointer to where we wrote out the sums last time.
            out_int32 -= int_by;

            // Restore the running sums.
            memcpy(the_sums.data(), out_int32, int_by * sizeof(int32_t));

            // Update the "real" pointer so that the next output will clobber the old sums.
            out = reinterpret_cast<TOut *>(out_int32);
        }
    }

    for (unsigned int pos=0; pos<width; pos+=block) {
        for (unsigned int row=0; row<int_by; row++) {
            // Row out of range - pad 'block' entries.
            if (row >= height) {
                for (unsigned int col=0; col<block; col++) {
                    *out++ = 0;
                }
                continue;
            }

            for (unsigned int col=0; col<block; col++) {
                // Column out of range - pad a single entry
                if (pos + col >= width) {
                    *out++ = 0;
                    continue;
                }

                if (integrate_sums) {
                    the_sums[row] += in[row][row_offset + pos + col];
                }

                *out++ = in[row][row_offset + pos + col];
            }
        }
    }

    if (integrate_sums) {
        int32_t *out_int32 = reinterpret_cast<int32_t *>(out);

        memcpy(out_int32, the_sums.data(), int_by * sizeof(int32_t));

        out = reinterpret_cast<TOut *>(out_int32 + int_by);
    }
}

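/*
 * Post-process the int32 row-sum block: if the row-sum multiplier is non-zero,
 * interleave_block() has already written the sums just behind 'out', so scale
 * them in place; if it is zero, the sums were never written, so emit a block of
 * zero sums at 'out' and advance 'out' past it.
 */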
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TOut>
inline void FixupRowSums(TOut * &out, const int32_t row_sum_multiplier) {
    const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);

    // If we are integrating row sums, we need to do some fix up, depending on whether the multiplier is non-zero or not.
    if (row_sum_multiplier) {
        // Non-zero: interleave_block<>() will have done the sums, so 'out' will point to the start of the
        // next block (post sums).
        // We need to go back and apply the multiplier to the computed sums. We don't need to change 'out'.
        int32_t *out_int32 = reinterpret_cast<int32_t *>(out);

        out_int32 -= height;
        for (unsigned int i=0; i<height; i++) {
            out_int32[i] *= row_sum_multiplier;
        }
    } else {
        // Zero: interleave_block<>() will *not* have done the sums, so 'out' will point to the start of the
        // sum block. We need to insert the (zero) sums, and advance 'out'.
        int32_t *out_int32 = reinterpret_cast<int32_t *>(out);

        for (unsigned int i=0; i<height; i++) {
            out_int32[i] = 0;
        }

        out_int32 += height;

        out = reinterpret_cast<TOut *>(out_int32);
    }
}

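/*
 * Indirect interleave: 'ptr' is indexed as ptr[string][row] (this is the
 * indexing used in the loop below), giving a pointer to that row's data for
 * that string.  The K range [k0, kmax) is addressed in units of
 * 'rounded_stringlen' per string, while at most 'stringlen' elements are read
 * from each string.
 */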
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
void IndirectInterleave(TOut *out, const TIn * const * const *ptr, unsigned int stringlen,
                        unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax,
                        const unsigned int k0, const unsigned int kmax, bool integrate_sums,
                        const int32_t row_sum_multiplier) {
    const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);

    // 'interleave_block' implementations are entitled to read a pointer for each row they handle from the input
    // pointer array, even for out of range rows (although they must not subsequently dereference those pointers for
    // out of range rows). This allows interleave_block to use techniques like row predication, or loading all
    // pointers and conditionally overriding the out of range ones.

    // This is problematic in the "pure" indirect case when we get to the last rows, where it can lead to out of
    // range reads. Avoid this with a local buffer to use in last-rows cases. Use alloca as a std::vector can be
    // expensive in highly threaded scenarios.
    const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));

    // Figure out the starting position based on k0 (with rounded length)
    unsigned int start_string = k0 / rounded_stringlen;
    unsigned int start_stringpos = k0 % rounded_stringlen;
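    // For example (illustrative values only): with rounded_stringlen=8 and
    // k0=20, interleaving starts in string 2, at position 4 within that string.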

    // Process blocks of 'height' height...
    for (unsigned int ybase = y0; ybase < ymax; ybase+=height) {
        // Height to process
        unsigned int active_height = std::min(ymax - ybase, height);

        // Track our progress through the various strings
        unsigned int k_left = (kmax - k0);
        unsigned int string = start_string;
        unsigned int stringpos = start_stringpos;

        bool first = true;

        // Prepare to call 'interleave_block' above for each string encompassed by K range
        while (k_left > 0) {
            // Width to process - and the width we will generate (with padding)
            unsigned int in_width = std::min(k_left, stringlen - stringpos);
            unsigned int out_width = std::min(k_left, rounded_stringlen - stringpos);

            const TIn * const *row_base = ptr[string] + ybase;

            // If not all rows are valid, copy the ones that are into local array (see above comment).
            if (active_height < height) {
                for (unsigned int i=0; i<active_height; i++) {
                    row_ptrs[i] = ptr[string][ybase + i];
                }

                row_base = row_ptrs;
            }

            // 'integrate_sums' is a function parameter rather than a template parameter to prevent duplicating too
            // much code. However, integrated sums make no sense for non-integral types and won't ever be
            // requested. So put a type trait check here to avoid generating pointless code.
            if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
                interleave_block<height_vectors, block, vlt, true>(out, row_base, in_width, active_height, stringpos, first);
            } else {
                interleave_block<height_vectors, block, vlt, false>(out, row_base, in_width, active_height, stringpos, first);
            }

            k_left -= out_width;
            string++;
            stringpos=0;
            first=false;
        }

        if (std::is_integral<TOut>::value && integrate_sums) {
            FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
        }
    }
}

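/*
 * Convolution variant: the row pointers for each block are produced on the fly
 * by the supplied convolver - each next_block() call below fills 'row_ptrs'
 * and returns the width and row offset to interleave for that block.
 */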
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
void ConvolutionInterleave(TOut *out, const TIn *in, size_t in_stride, const convolver<TIn> &conv, const unsigned int rounded_stringlen,
                           const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) {
    const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);

    auto conv_cols = conv.process_columns(in, in_stride, k0, kmax, rounded_stringlen);

    // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
    const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));

    for (unsigned int ybase = y0; ybase < ymax; ybase += height) {
        // How many of the rows are active - the rest will get padded in interleave_block.
        unsigned int active_height = std::min(ymax - ybase, height);
        bool first = true;

        auto conv_rows = conv_cols.process_rows(ybase, active_height);

        while (!conv_rows.finished()) {
            unsigned int width, offset;

            // Get next set of parameters
            std::tie(width, offset) = conv_rows.next_block(row_ptrs);

            // Perform the interleave
            if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
                interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, active_height, offset, first);
            } else {
                interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, active_height, offset, first);
            }

            first=false;
        }

        if (std::is_integral<TOut>::value && integrate_sums) {
            FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
        }
    }
}

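/*
 * Dense (non-indirect) variant: row 'r' of each block of rows starts at
 * in + (y + r) * in_stride, and the whole K range [k0, kmax) is handled by a
 * single interleave_block() call per block.
 */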
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
void Interleave(TOut *out, const TIn *in, size_t in_stride, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) {
    const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);

    // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
    const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));

    const unsigned int width=kmax-k0;

    for (unsigned int y=y0; y<ymax; y+=height) {
        for (unsigned int r=0; r<height; r++) {
            row_ptrs[r] = in + ((y + r) * in_stride);
        }

        if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
            interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, std::min(height, ymax-y), k0, true);
        } else {
            interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, std::min(height, ymax-y), k0, true);
        }

        if (std::is_integral<TOut>::value && integrate_sums) {
            FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
        }
    }
}

#include "indirect-interleaves/list.hpp"

/**** Instantiate needed implementations ****/

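/*
 * Hypothetical usage sketch (illustrative only; M, K, lda, a_matrix and
 * packed_out are placeholder names, not part of this file): the float/NEON
 * height-8 variant instantiated below could be driven as
 *
 *   Interleave<8, 1, VLType::None>(packed_out, a_matrix, lda, 0, M, 0, K, false, 0);
 *
 * which packs rows [0, M) and columns [0, K) of 'a_matrix' (row stride 'lda'
 * elements) into 'packed_out' in blocks of 8 rows, zero-padding the final
 * block if M is not a multiple of 8.
 */
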
/* AArch32 */
#ifdef __arm__
/* FP32 */
/* NEON implementation (height 6) */
template void IndirectInterleave<6, 1, VLType::None>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<6, 1, VLType::None>(float *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<6, 1, VLType::None>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* FP16 */
#if __ARM_FP16_ARGS
/* NEON implementation using FP32 kernel (height 6) */
template void IndirectInterleave<6, 1, VLType::None>(float *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<6, 1, VLType::None>(float *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<6, 1, VLType::None>(float *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
#endif /* __ARM_FP16_ARGS */

/* BF16 */
/* NEON implementation using FP32 kernel */
template void IndirectInterleave<6, 1, VLType::None>(float *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<6, 1, VLType::None>(float *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<6, 1, VLType::None>(float *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
#endif

/* AArch64 */
#ifdef __aarch64__
/* FP64 */
/* NEON/SVE implementation (height 8) */
template void IndirectInterleave<8, 1, VLType::None>(double *, const double * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(double *, const double *, size_t, const convolver<double> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(double *, const double *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* FP32 */
/* NEON/SVE implementation (height 8) */
template void IndirectInterleave<8, 1, VLType::None>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(float *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* FMMLA */
template void IndirectInterleave<8, 2, VLType::None>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 2, VLType::None>(float *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 2, VLType::None>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* FP16 */
template void IndirectInterleave<8, 1, VLType::None>(__fp16 *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(__fp16 *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(__fp16 *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

template void IndirectInterleave<8, 1, VLType::None>(float *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(float *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(float *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* BF16 */
/* NEON/SVE BFDOT */
template void IndirectInterleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

template void IndirectInterleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* NEON/SVE using FP32 kernel */
template void IndirectInterleave<8, 1, VLType::None>(float *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(float *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(float *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* INT16 */
template void IndirectInterleave<8, 1, VLType::None>(int16_t *, const int16_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(int16_t *, const int16_t *, size_t, const convolver<int16_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(int16_t *, const int16_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

template void IndirectInterleave<8, 1, VLType::None>(uint16_t *, const uint16_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(uint16_t *, const uint16_t *, size_t, const convolver<uint16_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(uint16_t *, const uint16_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* INT8 */
/* NEON SMLA/SMLAL (height 4, block 16) */
template void IndirectInterleave<4, 16, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<4, 16, VLType::None>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<4, 16, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* NEON SDOT (height 8, block 4) */
template void IndirectInterleave<8, 4, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 4, VLType::None>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 4, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* MMLA SMMLA (height 8, block 8) */
template void IndirectInterleave<8, 8, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 8, VLType::None>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 8, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* NEON SDOT (height 8, block 1) */
template void IndirectInterleave<8, 1, VLType::None>(int16_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(int16_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(int16_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* NEON SMLA/SMLAL (height 4, block 16) */
template void IndirectInterleave<4, 16, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<4, 16, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<4, 16, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* NEON SDOT (height 8, block 4) */
template void IndirectInterleave<8, 4, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 4, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 4, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* MMLA SMMLA (height 8, block 8) */
template void IndirectInterleave<8, 8, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 8, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 8, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* NEON 16-bit (height 8, block 1) */
template void IndirectInterleave<8, 1, VLType::None>(uint16_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(uint16_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(uint16_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
#endif // __aarch64__

} // namespace arm_gemm