/*
 * Copyright (c) 2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/*
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 *
 * NOTE: Header to be included by implementation files only.
 *
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 */

#include <limits>

#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp"

#pragma once

// Comment out the following definition to use floating-point requantisation;
// leave it defined to use fixed-point requantisation.
#define FIXED_POINT_REQUANTISATION 1
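// With fixed-point requantisation the accumulator is rescaled by an integer
// multiplier followed by a rounding right shift (see saturating_doubling_high_mul
// and rounding_divide_by_exp2 below); with floating-point requantisation it is
// multiplied by rescale_parameters.rescale instead.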

using namespace neon_convolution_kernels;
using namespace qasymm8;

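// Clamp a value of a wider arithmetic type to the representable range of T
// before narrowing; used to write requantised accumulators back as uint8_t.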
template <typename T>
struct clamp_to_limits
{
  template <typename U>
  static inline U clamp(const U& v)
  {
    const std::numeric_limits<T> limits;
    const U min = static_cast<U>(limits.min());
    const U max = static_cast<U>(limits.max());
    return std::min(std::max(v, min), max);
  }

  template <typename U>
  static inline T clamp_and_cast(const U& v)
  {
    return static_cast<T>(clamp(v));
  }
};

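// Saturating rounding doubling multiply returning the high half (the behaviour
// of the VQRDMULH instruction), i.e. approximately round(a * b / 2^31) with
// saturation.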
template <typename T>
inline T saturating_doubling_high_mul(const T&, const int32_t&);

template <>
inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32_t& b)
{
  return vqrdmulhq_n_s32(a, b);
}

template <>
inline int32_t saturating_doubling_high_mul(const int32_t& a, const int32_t& b)
{
  return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0);
}

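// Divide by a power of two (2^exponent) with rounding to nearest. The fixup
// term biases negative inputs before the rounding shift so the result is a
// correctly rounded division rather than a plain arithmetic shift.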
template <typename T>
inline T rounding_divide_by_exp2(const T& x, const int exponent);

template <>
inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int exponent)
{
  const int32x4_t shift = vdupq_n_s32(-exponent);
  const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
  const int32x4_t fixed = vqaddq_s32(x, fixup);
  return vrshlq_s32(fixed, shift);
}

template <>
inline int32x2_t rounding_divide_by_exp2(const int32x2_t& x, const int exponent)
{
  const int32x2_t shift = vdup_n_s32(-exponent);
  const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31);
  const int32x2_t fixed = vqadd_s32(x, fixup);
  return vrshl_s32(fixed, shift);
}

template <>
inline int32_t rounding_divide_by_exp2(const int32_t& x, const int exponent)
{
  const int32x2_t xs = vdup_n_s32(x);
  return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0);
}
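
// Together these helpers implement the fixed-point requantisation step
//   y = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, multiplier), shift)
// which approximates acc * (multiplier / 2^31) * 2^-shift.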

namespace depthwise
{
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::QAsymm8DepthwiseConvolution(
  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
  const ActivationFunction activation,
  const QAsymm8Params& weight_quantisation,
  const QAsymm8Params& input_quantisation,
  const QAsymm8Params& output_quantisation,
  unsigned int padding_top,
  unsigned int padding_left,
  unsigned int padding_bottom,
  unsigned int padding_right
) : QAsymm8DepthwiseConvolution(
    n_batches, n_input_rows, n_input_cols, n_channels,
    activation, weight_quantisation, input_quantisation, output_quantisation,
    QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
    padding_top, padding_left, padding_bottom, padding_right
  )
{
}

template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::QAsymm8DepthwiseConvolution(
  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
  const ActivationFunction activation,
  const QAsymm8Params& weight_quantisation,
  const QAsymm8Params& input_quantisation,
  const QAsymm8Params& output_quantisation,
  const QAsymm8RescaleParams& rescale_params,
  unsigned int padding_top,
  unsigned int padding_left,
  unsigned int padding_bottom,
  unsigned int padding_right
) : Base(
    n_batches, n_input_rows, n_input_cols, n_channels,
    get_activation_fn(activation, output_quantisation),
    padding_top, padding_left, padding_bottom, padding_right
  ),
  _weights_quant(weight_quantisation),
  _inputs_quant(input_quantisation),
  _output_quant(output_quantisation),
  rescale_parameters(rescale_params)
{
}

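// Select the cheapest activation that still produces correct results for the
// given output quantisation. Illustrative example: with offset 0 and scale
// 6/255 every representable output already lies in [0, 6], so a requested
// ReLU6 collapses to ActivationFunction::None.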
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
ActivationFunction QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::get_activation_fn(
  const ActivationFunction activation,
  const QAsymm8Params& output_quant
)
{
  if (
    (activation == ActivationFunction::ReLU &&
     output_quant.quantize(0) == 0) ||
    (activation == ActivationFunction::ReLU6 &&
     output_quant.quantize(0) == 0 &&
     output_quant.dequantize(255) <= 6.0f)
  )
  {
    // If the range of values which can be represented by a quantized value is
    // within the range that would be produced by the activation function, then
    // the activation function is redundant and can be skipped.
    return ActivationFunction::None;
  }
  else if (
    activation == ActivationFunction::ReLU6 &&
    output_quant.dequantize(255) <= 6.0f
  )
  {
    // If the largest value that can be represented by a quantized value is
    // lower than the upper boundary, then the activation function can be
    // relaxed to a ReLU.
    return ActivationFunction::ReLU;
  }

  return activation;
}

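// Padding elements are filled with the input quantisation offset, i.e. the
// quantised representation of 0.0f, so that padded positions contribute
// nothing to the convolution.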
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
uint8_t QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::_input_padding_value(void) const
{
  return _inputs_quant.offset;
}

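// Pack the weights and biases for a vector length's worth of channels at a time.
// With asymmetric quantisation the accumulator expands as
//   sum_k (w_k - w_off) * (x_k - x_off)
//     = sum_k w_k*x_k - x_off*sum_k w_k - w_off*sum_k x_k + K*w_off*x_off,
// so the packed bias absorbs the terms that depend only on the weights (the
// rank-0 term K*w_off*x_off and the rank-1 term x_off*sum_k w_k); the remaining
// w_off*sum_k x_k term is subtracted in execute_tile once the inputs are known.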
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
void QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::_pack_params(
  void * const buffer,
  const void * const weights,
  const unsigned int weight_row_stride,
  const unsigned int weight_col_stride,
  const void * const biases
) const
{
  const uint8_t *wptr = static_cast<const uint8_t *>(weights);
  const int32_t *bptr = static_cast<const int32_t *>(biases);
  uint8_t *outptr = static_cast<uint8_t *>(buffer);

  // We set the vector length to use quad (Q) registers on AArch64 and double
  // (D) registers on AArch32. NOTE: For SVE set this to the actual vector length.
#if defined(__aarch64__)
  unsigned int veclen = 16;
#else
#if defined(__arm__)
  unsigned int veclen = 8;
#endif
#endif

  // Compute the rank 0 offset arising from the quantisation parameters.
  const int32_t rank0_offset = (KernelRows * KernelCols *
                                static_cast<int32_t>(_weights_quant.offset) *
                                static_cast<int32_t>(_inputs_quant.offset));

  // While there are channels left to process, pack a vector length of them at
  // a time and reduce the size of vector used as the size of the tensor
  // decreases.
  for (
    unsigned int n_channels = this->n_channels(); n_channels;
    n_channels -= veclen,
    outptr += veclen*(sizeof(int32_t) + this->kernel_rows*this->kernel_cols)
  )
  {
    // NOTE Ignore this section if using SVE, the vector length remains the
    // same and we just don't fill a full register for the tail.
    while (n_channels < veclen)
    {
      // Reduce the vector length to either 8 or 1 (scalar)
      // TODO Support more vector lengths in `execute_tile`.
      veclen = (veclen == 16) ? 8 : 1;
    }

    // Get pointers to bias and weight portions of the output structure.
    int32_t *out_bptr = reinterpret_cast<int32_t *>(outptr);
    uint8_t *out_wptr = outptr + veclen*sizeof(int32_t);

    // Copy a vector length of elements
    for (unsigned int n = 0; n < veclen && n < n_channels; n++)
    {
      int32_t bias = (bptr != nullptr) ? *(bptr++) : 0;
      uint32_t weight_sum = 0;

      for (unsigned int i = 0; i < KernelRows; i++)
      {
        uint8_t *row_outptr = out_wptr + i*KernelCols*veclen;
        for (unsigned int j = 0; j < KernelCols; j++)
        {
          uint8_t w = *(wptr + i*weight_row_stride + j*weight_col_stride);
          row_outptr[j*veclen + n] = w;
          weight_sum += static_cast<uint32_t>(w);
        }
      }
      wptr++;

      // Include in the bias the contributions from the quantisation offsets.
      int32_t rank1_offset = static_cast<int32_t>(
        static_cast<uint32_t>(_inputs_quant.offset) * weight_sum
      );
      out_bptr[n] = bias + rank0_offset - rank1_offset;
    }
  }
}

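// Execute a single output tile. Channels are processed 16 at a time on AArch64
// (quad registers), then 8 at a time (double registers), with a scalar loop for
// any remaining tail; each path accumulates the products, subtracts the
// weight-offset correction, requantises and applies the activation.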
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
template<ActivationFunction Activation>
void QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::execute_tile(
  int n_channels,
  const void* packed_params,
  const uint8_t* inptr,
  const unsigned int in_row_stride,
  const unsigned int in_col_stride,
  uint8_t* outptr,
  const unsigned int out_row_stride,
  const unsigned int out_col_stride
)
{
  // Activation parameters (unused if Activation is None)
  const uint8_t aqmin = _output_quant.offset;
  const uint8_t aqmax = (Activation == ActivationFunction::ReLU6) ?
    std::min<uint8_t>(255u, _output_quant.quantize(6.0f)) : 255u;

  // Byte type pointer to weights and biases
  const uint8_t *wbptr = static_cast<const uint8_t *>(packed_params);

#if defined(__aarch64__) // Under AArch64 only use quad registers
  for (; n_channels >= 16; n_channels -= 16)
  {
    // Load biases
    const int32x4_t biases[4] = {
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 8),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 12)
    };
    wbptr += 16*sizeof(int32_t);

    // Load weights
    uint8x16_t weights[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        weights[i][j] = vld1q_u8(wbptr);
        wbptr += 16;
      }
    }

    // Load the input activations
    uint8x16_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
    for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
      {
        inputs[i][j] = vld1q_u8(inptr + i*in_row_stride + j*in_col_stride);
      }
    }
    inptr += 16;

    // Perform the convolution
    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
    {
      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
      {
        // Two sets of operations are required: we perform the
        // multiply-accumulates for the convolution proper but must also sum
        // the tile elements to account for the _weight_ offset.
        uint32x4_t accs[4];
        for (unsigned int i = 0; i < 4; i++)
        {
          accs[i] = reinterpret_cast<uint32x4_t>(biases[i]);
        }

        for (unsigned int wi = 0; wi < KernelRows; wi++)
        {
          for (unsigned int wj = 0; wj < KernelCols; wj++)
          {
            // Get relevant weight and activation pixel
            const uint8x16_t w = weights[wi][wj];
            const uint8x16_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];

            // Perform multiplication and accumulation
            const uint16x8_t muls[2] = {
              vmull_u8(vget_low_u8(w), vget_low_u8(x)),
              vmull_u8(vget_high_u8(w), vget_high_u8(x))
            };

            const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset);
            const uint16x8_t sum_elems[2] = {
              vmull_u8(vget_low_u8(x), woffset),
              vmull_u8(vget_high_u8(x), woffset)
            };

            const uint32x4_t tmps[4] = {
              vsubl_u16(vget_low_u16(muls[0]), vget_low_u16(sum_elems[0])),
              vsubl_u16(vget_high_u16(muls[0]), vget_high_u16(sum_elems[0])),
              vsubl_u16(vget_low_u16(muls[1]), vget_low_u16(sum_elems[1])),
              vsubl_u16(vget_high_u16(muls[1]), vget_high_u16(sum_elems[1])),
            };
            for (unsigned int i = 0; i < 4; i++)
            {
              accs[i] = vaddq_u32(accs[i], tmps[i]);
            }
          }
        }

        // Rescale the accumulator and add in the new offset.
        uint32x4_t final_accs[4];
        for (unsigned int i = 0; i < 4; i++)
        {
#ifdef FIXED_POINT_REQUANTISATION
          const int32x4_t y = rounding_divide_by_exp2(
            saturating_doubling_high_mul(
              reinterpret_cast<int32x4_t>(accs[i]), rescale_parameters.multiplier
            ),
            rescale_parameters.shift
          );
          const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(_output_quant.offset));
          final_accs[i] = reinterpret_cast<uint32x4_t>(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0)));
#else // floating point requantisation
          float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast<int32x4_t>(accs[i]));
          fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale));
          fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast<float>(_output_quant.offset)));
          fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f));
          final_accs[i] = vcvtq_u32_f32(fp_acc);
#endif
        }

        uint8x16_t output = vcombine_u8(
          vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1]))),
          vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[2]), vqmovn_u32(final_accs[3])))
        );

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          output = vmaxq_u8(output, vdupq_n_u8(aqmin));
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          output = vminq_u8(output, vdupq_n_u8(aqmax));
        }

        vst1q_u8(outptr + oi*out_row_stride + oj*out_col_stride, output);
      }
    }
    outptr += 16;
  }
#endif // defined(__aarch64__)
  for (; n_channels >= 8; n_channels -= 8)
  {
    const int32x4_t biases[2] = {
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
    };
    wbptr += 8*sizeof(int32_t);

    uint8x8_t weights[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        weights[i][j] = vld1_u8(wbptr);
        wbptr += 8;
      }
    }

    uint8x8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
    for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
      {
        inputs[i][j] = vld1_u8(inptr + i*in_row_stride + j*in_col_stride);
      }
    }
    inptr += 8;

    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
    {
      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
      {
        uint32x4_t accs[2];
        for (unsigned int i = 0; i < 2; i++)
        {
          accs[i] = reinterpret_cast<uint32x4_t>(biases[i]);
        }

        for (unsigned int wi = 0; wi < KernelRows; wi++)
        {
          for (unsigned int wj = 0; wj < KernelCols; wj++)
          {
            const uint8x8_t w = weights[wi][wj];
            const uint8x8_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];

            const uint16x8_t muls = vmull_u8(w, x);
            const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset);
            const uint16x8_t sum_elems = vmull_u8(x, woffset);

            const uint32x4_t tmps[2] = {
              vsubl_u16(vget_low_u16(muls), vget_low_u16(sum_elems)),
              vsubl_u16(vget_high_u16(muls), vget_high_u16(sum_elems)),
            };
            for (unsigned int i = 0; i < 2; i++)
            {
              accs[i] = vaddq_u32(accs[i], tmps[i]);
            }
          }
        }

        uint32x4_t final_accs[2];
        for (unsigned int i = 0; i < 2; i++)
        {
#ifdef FIXED_POINT_REQUANTISATION
          const int32x4_t y = rounding_divide_by_exp2(
            saturating_doubling_high_mul(
              reinterpret_cast<int32x4_t>(accs[i]), rescale_parameters.multiplier
            ),
            rescale_parameters.shift
          );
          const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(_output_quant.offset));
          final_accs[i] = reinterpret_cast<uint32x4_t>(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0)));
#else // floating point requantisation
          float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast<int32x4_t>(accs[i]));
          fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale));
          fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast<float>(_output_quant.offset)));
          fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f));
          final_accs[i] = vcvtq_u32_f32(fp_acc);
#endif
        }

        uint8x8_t output = vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1])));

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          output = vmax_u8(output, vdup_n_u8(aqmin));
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          output = vmin_u8(output, vdup_n_u8(aqmax));
        }

        vst1_u8(outptr + oi*out_row_stride + oj*out_col_stride, output);
      }
    }
    outptr += 8;
  }
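
  // Scalar tail: process any remaining channels one at a time using plain
  // integer arithmetic.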
  for (; n_channels; n_channels--)
  {
    // Load bias
    const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
    wbptr += sizeof(int32_t);

    // Load weights
    uint8_t weights[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        weights[i][j] = *(wbptr++);
      }
    }

    // Load the input activations
    uint8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
    for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
      {
        inputs[i][j] = *(inptr + i*in_row_stride + j*in_col_stride);
      }
    }
    inptr++;

    // Perform the convolution
    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
    {
      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
      {
        int32_t acc = bias;
        uint32_t element_sum = 0;

        for (unsigned int wi = 0; wi < KernelRows; wi++)
        {
          for (unsigned int wj = 0; wj < KernelCols; wj++)
          {
            const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
            acc += static_cast<int32_t>(static_cast<uint32_t>(w) * static_cast<uint32_t>(x));
            element_sum += static_cast<uint32_t>(x);
          }
        }

        acc -= static_cast<int32_t>(element_sum) * static_cast<int32_t>(_weights_quant.offset);

        // Requantise
#ifdef FIXED_POINT_REQUANTISATION
        acc = rounding_divide_by_exp2(
          saturating_doubling_high_mul(acc, rescale_parameters.multiplier),
          rescale_parameters.shift
        );
        acc += _output_quant.offset;
        uint8_t output = clamp_to_limits<uint8_t>::clamp_and_cast<int32_t>(acc);
#else // floating point requantisation
        float fp_acc = static_cast<float>(acc);
        fp_acc *= rescale_parameters.rescale;
        fp_acc += static_cast<float>(_output_quant.offset);
        fp_acc = std::max<float>(fp_acc, 0.0f);
        uint8_t output = static_cast<uint8_t>(std::min<int32_t>(static_cast<int32_t>(fp_acc), 255));
#endif

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          output = std::max(output, aqmin);
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          output = std::min(output, aqmax);
        }

        *(outptr + oi*out_row_stride + oj*out_col_stride) = output;
      }
    }
    outptr++;
  }
}

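// Overload of execute_tile taking an array of pointers, one per element of the
// input and output tiles, rather than base pointers and strides; the
// per-channel computation is identical to the overload above.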
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
template<ActivationFunction Activation>
void QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::execute_tile(
  int n_channels,
  const void* packed_params,
  const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
  uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
)
{
  // Activation parameters (unused if Activation is None)
  const uint8_t aqmin = _output_quant.offset;
  const uint8_t aqmax = (Activation == ActivationFunction::ReLU6) ?
    std::min<uint8_t>(255u, _output_quant.quantize(6.0f)) : 255u;

  // Byte type pointer to weights and biases
  const uint8_t *wbptr = static_cast<const uint8_t *>(packed_params);

  // Offset into input/output tensors
  int n = 0;

#if defined(__aarch64__) // Under AArch64 only use quad registers
  for (; n_channels >= 16; n_channels -= 16, n += 16)
  {
    // Load biases
    const int32x4_t biases[4] = {
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 8),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 12)
    };
    wbptr += 16*sizeof(int32_t);

    // Load weights
    uint8x16_t weights[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        weights[i][j] = vld1q_u8(wbptr);
        wbptr += 16;
      }
    }

    // Load the input activations
    uint8x16_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
    for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
      {
        inputs[i][j] = vld1q_u8(inptrs[i][j] + n);
      }
    }

    // Perform the convolution
    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
    {
      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
      {
        // Two sets of operations are required: we perform the
        // multiply-accumulates for the convolution proper but must also sum
        // the tile elements to account for the _weight_ offset.
        uint32x4_t accs[4];
        for (unsigned int i = 0; i < 4; i++)
        {
          accs[i] = reinterpret_cast<uint32x4_t>(biases[i]);
        }

        for (unsigned int wi = 0; wi < KernelRows; wi++)
        {
          for (unsigned int wj = 0; wj < KernelCols; wj++)
          {
            // Get relevant weight and activation pixel
            const uint8x16_t w = weights[wi][wj];
            const uint8x16_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];

            // Perform multiplication and accumulation
            const uint16x8_t muls[2] = {
              vmull_u8(vget_low_u8(w), vget_low_u8(x)),
              vmull_u8(vget_high_u8(w), vget_high_u8(x))
            };

            const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset);
            const uint16x8_t sum_elems[2] = {
              vmull_u8(vget_low_u8(x), woffset),
              vmull_u8(vget_high_u8(x), woffset)
            };

            const uint32x4_t tmps[4] = {
              vsubl_u16(vget_low_u16(muls[0]), vget_low_u16(sum_elems[0])),
              vsubl_u16(vget_high_u16(muls[0]), vget_high_u16(sum_elems[0])),
              vsubl_u16(vget_low_u16(muls[1]), vget_low_u16(sum_elems[1])),
              vsubl_u16(vget_high_u16(muls[1]), vget_high_u16(sum_elems[1])),
            };
            for (unsigned int i = 0; i < 4; i++)
            {
              accs[i] = vaddq_u32(accs[i], tmps[i]);
            }
          }
        }

        // Rescale the accumulator and add in the new offset.
        uint32x4_t final_accs[4];
        for (unsigned int i = 0; i < 4; i++)
        {
#ifdef FIXED_POINT_REQUANTISATION
          const int32x4_t y = rounding_divide_by_exp2(
            saturating_doubling_high_mul(
              reinterpret_cast<int32x4_t>(accs[i]), rescale_parameters.multiplier
            ),
            rescale_parameters.shift
          );
          const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(_output_quant.offset));
          final_accs[i] = reinterpret_cast<uint32x4_t>(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0)));
#else // floating point requantisation
          float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast<int32x4_t>(accs[i]));
          fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale));
          fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast<float>(_output_quant.offset)));
          fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f));
          final_accs[i] = vcvtq_u32_f32(fp_acc);
#endif
        }

        uint8x16_t output = vcombine_u8(
          vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1]))),
          vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[2]), vqmovn_u32(final_accs[3])))
        );

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          output = vmaxq_u8(output, vdupq_n_u8(aqmin));
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          output = vminq_u8(output, vdupq_n_u8(aqmax));
        }

        vst1q_u8(outptrs[oi][oj] + n, output);
      }
    }
  }
#endif // defined(__aarch64__)
  for (; n_channels >= 8; n_channels -= 8, n += 8)
  {
    const int32x4_t biases[2] = {
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
    };
    wbptr += 8*sizeof(int32_t);

    uint8x8_t weights[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        weights[i][j] = vld1_u8(wbptr);
        wbptr += 8;
      }
    }

    uint8x8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
    for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
      {
        inputs[i][j] = vld1_u8(inptrs[i][j] + n);
      }
    }

    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
    {
      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
      {
        uint32x4_t accs[2];
        for (unsigned int i = 0; i < 2; i++)
        {
          accs[i] = reinterpret_cast<uint32x4_t>(biases[i]);
        }

        for (unsigned int wi = 0; wi < KernelRows; wi++)
        {
          for (unsigned int wj = 0; wj < KernelCols; wj++)
          {
            const uint8x8_t w = weights[wi][wj];
            const uint8x8_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];

            const uint16x8_t muls = vmull_u8(w, x);
            const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset);
            const uint16x8_t sum_elems = vmull_u8(x, woffset);

            const uint32x4_t tmps[2] = {
              vsubl_u16(vget_low_u16(muls), vget_low_u16(sum_elems)),
              vsubl_u16(vget_high_u16(muls), vget_high_u16(sum_elems)),
            };
            for (unsigned int i = 0; i < 2; i++)
            {
              accs[i] = vaddq_u32(accs[i], tmps[i]);
            }
          }
        }

        uint32x4_t final_accs[2];
        for (unsigned int i = 0; i < 2; i++)
        {
#ifdef FIXED_POINT_REQUANTISATION
          const int32x4_t y = rounding_divide_by_exp2(
            saturating_doubling_high_mul(
              reinterpret_cast<int32x4_t>(accs[i]), rescale_parameters.multiplier
            ),
            rescale_parameters.shift
          );
          const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(_output_quant.offset));
          final_accs[i] = reinterpret_cast<uint32x4_t>(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0)));
#else // floating point requantisation
          float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast<int32x4_t>(accs[i]));
          fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale));
          fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast<float>(_output_quant.offset)));
          fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f));
          final_accs[i] = vcvtq_u32_f32(fp_acc);
#endif
        }

        uint8x8_t output = vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1])));

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          output = vmax_u8(output, vdup_n_u8(aqmin));
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          output = vmin_u8(output, vdup_n_u8(aqmax));
        }

        vst1_u8(outptrs[oi][oj] + n, output);
      }
    }
  }
  for (; n_channels; n_channels--, n++)
  {
    // Load bias
    const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
    wbptr += sizeof(int32_t);

    // Load weights
    uint8_t weights[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        weights[i][j] = *(wbptr++);
      }
    }

    // Load the input activations
    uint8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
    for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
      {
        inputs[i][j] = *(inptrs[i][j] + n);
      }
    }

    // Perform the convolution
    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
    {
      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
      {
        int32_t acc = bias;
        uint32_t element_sum = 0;

        for (unsigned int wi = 0; wi < KernelRows; wi++)
        {
          for (unsigned int wj = 0; wj < KernelCols; wj++)
          {
            const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
            acc += static_cast<int32_t>(static_cast<uint32_t>(w) * static_cast<uint32_t>(x));
            element_sum += static_cast<uint32_t>(x);
          }
        }

        acc -= static_cast<int32_t>(element_sum) * static_cast<int32_t>(_weights_quant.offset);

        // Requantise
#ifdef FIXED_POINT_REQUANTISATION
        acc = rounding_divide_by_exp2(
          saturating_doubling_high_mul(acc, rescale_parameters.multiplier),
          rescale_parameters.shift
        );
        acc += _output_quant.offset;
        uint8_t output = clamp_to_limits<uint8_t>::clamp_and_cast<int32_t>(acc);
#else // floating point requantisation
        float fp_acc = static_cast<float>(acc);
        fp_acc *= rescale_parameters.rescale;
        fp_acc += static_cast<float>(_output_quant.offset);
        fp_acc = std::max<float>(fp_acc, 0.0f);
        uint8_t output = static_cast<uint8_t>(std::min<int32_t>(static_cast<int32_t>(fp_acc), 255));
#endif

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          output = std::max(output, aqmin);
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          output = std::min(output, aqmax);
        }

        *(outptrs[oi][oj] + n) = output;
      }
    }
  }
}

} // namespace depthwise