/*
 * Copyright (c) 2019 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/*
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 *
 * NOTE: Header to be included by implementation files only.
 *
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 */

#pragma once

#include <limits>

#include "arm.hpp"
#include "impl_base.hpp"
#include "depthwise_quantized.hpp"

namespace {

template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols,
  typename FInput, typename FOutput
>
static inline void tilefn_hybrid(
  int n_channels,
  const void* packed_params,
  FInput &get_input_ptr,
  FOutput &get_output_ptr,
  int32_t clamp_min,
  int32_t clamp_max,
  uint8_t input_offset,
  uint8_t output_offset
)
{
  constexpr int InnerTileRows = StrideRows * (OutputTileRows - 1) + KernelRows;
  constexpr int InnerTileCols = StrideCols * (OutputTileCols - 1) + KernelCols;
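  // For example, a 2x2 output tile computed with a 3x3 kernel at stride 1
  // reads a 4x4 patch of input values.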

  // Offset into channels
  int channel = 0;

  // Byte-typed pointer to the packed weights and biases
  const int8_t *wbptr = static_cast<const int8_t *>(packed_params);

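  // Each packed block of eight channels starts with 24 int32 values: eight
  // biases, eight fixed-point multipliers and eight (negated) shifts, followed
  // by the int8 kernel weights interleaved eight channels at a time (see
  // _pack_params below).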
  for (; n_channels >= 8; n_channels -= 8, channel += 8)
  {
    const int32x4_t biases[2] = {
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
    };
    const int32x4_t multipliers[2] = {
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 8),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 12),
    };
    const int32x4_t shifts[2] = {
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 16),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 20),
    };
    wbptr += 24*sizeof(int32_t);

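    // Widen the packed int8 kernel weights to int16 for the
    // multiply-accumulates below.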
    int16x8_t weights[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        const auto w = vld1_s8(wbptr);
        weights[i][j] = vmovl_s8(w);
        wbptr += 8;
      }
    }

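    // Load the input patch and subtract the asymmetric input offset, widening
    // each uint8 lane to int16.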
    int16x8_t inputs[InnerTileRows][InnerTileCols];
    const uint8x8_t ioffset = vdup_n_u8(input_offset);
    for (unsigned int i = 0; i < InnerTileRows; i++)
    {
      for (unsigned int j = 0; j < InnerTileCols; j++)
      {
        const auto x = vld1_u8(get_input_ptr(i, j, channel));
        inputs[i][j] = vreinterpretq_s16_u16(vsubl_u8(x, ioffset));
      }
    }

    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
    {
      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
      {
        int32x4_t accs[2];
        for (unsigned int i = 0; i < 2; i++)
        {
          accs[i] = biases[i];
        }

        for (unsigned int wi = 0; wi < KernelRows; wi++)
        {
          for (unsigned int wj = 0; wj < KernelCols; wj++)
          {
            const auto w = weights[wi][wj];
            const auto x = inputs[oi * StrideRows + wi][oj * StrideCols + wj];
            accs[0] = vmlal_s16(accs[0], vget_low_s16(w), vget_low_s16(x));
            accs[1] = vmlal_s16(accs[1], vget_high_s16(w), vget_high_s16(x));
          }
        }

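        // Requantize: saturating doubling high multiply by the fixed-point
        // multiplier, then a rounding shift. The shifts were stored negated
        // during packing, so they can be passed to the helper as-is.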
        int32x4_t final_accs[2];
        for (unsigned int i = 0; i < 2; i++)
        {
          const int32x4_t y = rounding_divide_by_exp2(
            saturating_doubling_high_mul(accs[i], multipliers[i]),
            shifts[i]);
          const int32x4_t offset = vreinterpretq_s32_u32(vdupq_n_u32(output_offset));
          final_accs[i] = vaddq_s32(y, offset);
          final_accs[i] = vmaxq_s32(final_accs[i], vdupq_n_s32(clamp_min));
          final_accs[i] = vminq_s32(final_accs[i], vdupq_n_s32(clamp_max));
        }

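        // Narrow the two int32x4 accumulators to eight uint8 lanes by taking
        // the even int16 (and then int8) elements of each pair of registers.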
        const auto elems_s16 = vuzpq_s16(vreinterpretq_s16_s32(final_accs[0]),
                                         vreinterpretq_s16_s32(final_accs[1]));
        const int8x16_t elems = vreinterpretq_s8_s16(elems_s16.val[0]);
        const uint8x8_t output =
          vget_low_u8(vreinterpretq_u8_s8(vuzpq_s8(elems, elems).val[0]));

        vst1_u8(get_output_ptr(oi, oj, channel), output);
      }
    }
  }

  // Process the remaining channels one at a time
  for (; n_channels; n_channels--, channel++)
  {
    // Load the bias, multiplier and (negated) shift for this channel
    const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
    const int32_t multiplier = *reinterpret_cast<const int32_t *>(wbptr + sizeof(int32_t));
    const int32_t shift = *reinterpret_cast<const int32_t *>(wbptr + 2*sizeof(int32_t));

    wbptr += 3*sizeof(int32_t);

    // Load weights
    int16_t weights[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        weights[i][j] = *(wbptr++);
      }
    }

    // Load the input activations
    int16_t inputs[InnerTileRows][InnerTileCols];
    for (unsigned int i = 0; i < InnerTileRows; i++)
    {
      for (unsigned int j = 0; j < InnerTileCols; j++)
      {
        inputs[i][j] = *(get_input_ptr(i, j, channel)) - input_offset;
      }
    }

    // Perform the convolution
    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
    {
      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
      {
        int32_t acc = bias;

        for (unsigned int wi = 0; wi < KernelRows; wi++)
        {
          for (unsigned int wj = 0; wj < KernelCols; wj++)
          {
            const auto w = weights[wi][wj];
            const auto x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
            acc += w * x;
          }
        }

        // Requantize
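        // The packed shift is stored negated (see _pack_params), so negate it
        // back to obtain a positive exponent for the scalar helper.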
        acc = rounding_divide_by_exp2(
          saturating_doubling_high_mul(acc, multiplier),
          -shift);
        acc += output_offset;
        acc = std::max(acc, clamp_min);
        acc = std::min(acc, clamp_max);
        uint8_t output = static_cast<uint8_t>(acc);
        *(get_output_ptr(oi, oj, channel)) = output;
      }
    }
  }
}

template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols,
  typename FInput, typename FOutput
>
static inline void execute_tilefn_hybrid(
  int n_channels,
  const void* packed_params,
  const ActivationFunction actfn,
  const qasymm8::QAsymm8Params &input_quant,
  const qasymm8::QAsymm8Params &output_quant,
  FInput &get_input_ptr,
  FOutput &get_output_ptr)
{
  // Compute the min/max clamp values
  int32_t clamp_min = std::numeric_limits<uint8_t>::min();
  int32_t clamp_max = std::numeric_limits<uint8_t>::max();

  // ReLU clamps the lower bound at the quantized representation of zero
  if (actfn == ActivationFunction::ReLU) {
    clamp_min = output_quant.offset;
  }

  // ReLU6 clamps the upper bound at the quantized value of 6.0f
  if (actfn == ActivationFunction::ReLU6) {
    const int32_t top_rail = output_quant.quantize(6.0f);
    clamp_max = std::min(clamp_max, top_rail);
  }

  // Call the tile execution method
  tilefn_hybrid<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
                StrideRows, StrideCols>(
    n_channels, packed_params, get_input_ptr, get_output_ptr,
    clamp_min, clamp_max, input_quant.offset, output_quant.offset);
}

}  // anonymous namespace

namespace depthwise {

using namespace qsymm8;

template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
QSymm8HybridPerChannelDepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::QSymm8HybridPerChannelDepthwiseConvolution(
  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
  const ActivationFunction activation,
  const QSymm8PerChannelParams& weight_quantisation,
  const qasymm8::QAsymm8Params& input_quantisation,
  const qasymm8::QAsymm8Params& output_quantisation,
  unsigned int padding_top,
  unsigned int padding_left,
  unsigned int padding_bottom,
  unsigned int padding_right
) : QSymm8HybridPerChannelDepthwiseConvolution(
      n_batches, n_input_rows, n_input_cols, n_channels,
      activation, weight_quantisation, input_quantisation, output_quantisation,
      QSymm8PerChannelRescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
      padding_top, padding_left, padding_bottom, padding_right
    )
{
}
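
// A minimal usage sketch with hypothetical shapes and quantisation
// parameters (the packing/execution entry points come from the Base class
// and are not shown here):
//
//   using Conv = QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 1, 1>;
//   Conv conv(1, 224, 224, 64, ActivationFunction::ReLU,
//             weight_quant, input_quant, output_quant,
//             1, 1, 1, 1);  // padding: top, left, bottom, right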

template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
QSymm8HybridPerChannelDepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::QSymm8HybridPerChannelDepthwiseConvolution(
  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
  const ActivationFunction activation,
  const QSymm8PerChannelParams& weight_quantisation,
  const qasymm8::QAsymm8Params& input_quantisation,
  const qasymm8::QAsymm8Params& output_quantisation,
  const QSymm8PerChannelRescaleParams& rescale_params,
  unsigned int padding_top,
  unsigned int padding_left,
  unsigned int padding_bottom,
  unsigned int padding_right
) : Base(
      n_batches, n_input_rows, n_input_cols, n_channels, activation,
      padding_top, padding_left, padding_bottom, padding_right
    ),
    _weights_quant(weight_quantisation),
    _input_quant(input_quantisation),
    _output_quant(output_quantisation),
    _rescale_parameters(rescale_params)
{
}

template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
uint8_t QSymm8HybridPerChannelDepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::_input_padding_value(void) const
{
  return _input_quant.offset;
}

template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
void QSymm8HybridPerChannelDepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::_pack_params(
  void * const buffer,
  const void * const weights,
  const unsigned int weight_row_stride,
  const unsigned int weight_col_stride,
  const void * const biases
) const
{
  const int8_t *wptr = static_cast<const int8_t *>(weights);
  const int32_t *bptr = static_cast<const int32_t *>(biases);
  const int32_t *mptr = static_cast<const int32_t *>(_rescale_parameters.multipliers.data());
  const int32_t *sptr = static_cast<const int32_t *>(_rescale_parameters.shifts.data());
  int8_t *outptr = static_cast<int8_t *>(buffer);

  // Use a vector length of 8, i.e. one 64-bit doubleword (D) register, on
  // both AArch64 and AArch32. NOTE For SVE set this to half the vector length.
  unsigned int veclen = 8;

  // While there are channels left to process, pack a vector length of them at
  // a time, reducing the vector length used as the number of remaining
  // channels decreases.
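  // Each channel contributes 3*sizeof(int32_t) bytes of requantisation
  // parameters (bias, multiplier, shift) plus kernel_rows*kernel_cols bytes
  // of weights to the packed buffer; this is the stride applied to outptr.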
  for (
    unsigned int n_channels = this->n_channels(); n_channels;
    n_channels -= veclen,
    outptr += veclen*(3*sizeof(int32_t) + this->kernel_rows*this->kernel_cols)
  )
  {
    // NOTE Ignore this section if using SVE, the vector length remains the
    // same and we just don't fill a full register for the tail.
    while (n_channels < veclen)
    {
      // Reduce the vector length to either 8 or 1 (scalar)
      // TODO Support more vector lengths in `execute_tile`.
      veclen = (veclen == 16) ? 8 : 1;
    }

    // Get pointers to the bias, multiplier, shift and weight portions of the
    // packed output block.
    int32_t *out_bptr = reinterpret_cast<int32_t *>(outptr);
    int32_t *out_mptr = reinterpret_cast<int32_t *>(outptr + veclen*sizeof(int32_t));
    int32_t *out_sptr = reinterpret_cast<int32_t *>(outptr + 2*veclen*sizeof(int32_t));
    int8_t *out_wptr = outptr + 3*veclen*sizeof(int32_t);

    // Copy a vector length of elements
    for (unsigned int n = 0; n < veclen && n < n_channels; n++)
    {
      const int32_t bias = (bptr != nullptr) ? *(bptr++) : 0;
      const int32_t multiplier = (mptr != nullptr) ? *(mptr++) : 0;
      const int32_t shift = (sptr != nullptr) ? *(sptr++) : 0;

      out_bptr[n] = bias;
      out_mptr[n] = multiplier;
      out_sptr[n] = -shift;  // Stored negated so the vector path can pass it straight to the rounding-shift helper

      for (unsigned int i = 0; i < KernelRows; i++)
      {
        int8_t *row_outptr = out_wptr + i*KernelCols*veclen;
        for (unsigned int j = 0; j < KernelCols; j++)
        {
          int8_t w = *(wptr + i*weight_row_stride + j*weight_col_stride);
          row_outptr[j*veclen + n] = w;
        }
      }
      wptr++;
    }
  }
}

template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
template <ActivationFunction Activation>
void QSymm8HybridPerChannelDepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::execute_tile(
  int n_channels,
  const void* packed_params,
  const uint8_t* inptr,
  unsigned int in_row_stride,
  unsigned int in_col_stride,
  uint8_t* outptr,
  unsigned int out_row_stride,
  unsigned int out_col_stride
)
{
  // Construct methods to get pointers
  const auto get_input_ptr = [inptr, in_row_stride, in_col_stride](
    const int i, const int j, const int channel) {
    return inptr + i * in_row_stride + j * in_col_stride + channel;
  };

  const auto get_output_ptr = [outptr, out_row_stride, out_col_stride](
    const int i, const int j, const int channel) {
    return outptr + i * out_row_stride + j * out_col_stride + channel;
  };

  // Call the tile execution method
  execute_tilefn_hybrid<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
                        StrideRows, StrideCols>(
    n_channels, packed_params, Activation, _input_quant, _output_quant,
    get_input_ptr, get_output_ptr);
}

template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
template <ActivationFunction Activation>
void QSymm8HybridPerChannelDepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::execute_tile(
  int n_channels,
  const void* packed_params,
  const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
  uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
)
{
  // Construct methods to get pointers
  const auto get_input_ptr = [inptrs](const int i, const int j,
                                      const int channel) {
    return inptrs[i][j] + channel;
  };

  const auto get_output_ptr = [outptrs](const int i, const int j,
                                        const int channel) {
    return outptrs[i][j] + channel;
  };

  // Call the tile execution method
  execute_tilefn_hybrid<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
                        StrideRows, StrideCols>(
    n_channels, packed_params, Activation, _input_quant, _output_quant,
    get_input_ptr, get_output_ptr);
}

}  // namespace depthwise