/*
 * Copyright (c) 2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/*
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 *
 * NOTE: Header to be included by implementation files only.
 *
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 */

#include <limits>

#include "arm.hpp"
#include "impl_base.hpp"
#include "depthwise_quantized.hpp"

#pragma once

// Comment out the following definition to use floating-point requantisation;
// leave it defined to use fixed-point requantisation.
#define FIXED_POINT_REQUANTISATION 1

using namespace neon_convolution_kernels;
using namespace qasymm8;

// Clamp a value of type U to the representable range of type T.
template <typename T>
struct clamp_to_limits
{
  template <typename U>
  static inline U clamp(const U& v)
  {
    const std::numeric_limits<T> limits;
    const U min = static_cast<U>(limits.min());
    const U max = static_cast<U>(limits.max());
    return std::min(std::max(v, min), max);
  }

  template <typename U>
  static inline T clamp_and_cast(const U& v)
  {
    return static_cast<T>(clamp(v));
  }
};
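
// For example, clamp_to_limits<uint8_t>::clamp_and_cast<int32_t>(300) yields
// 255, and clamp_to_limits<uint8_t>::clamp_and_cast<int32_t>(-7) yields 0.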

// Compute the rounding, doubling, saturating high multiply of `a` and `b`
// (VQRDMULH): the high 32 bits of 2*a*b, rounded, saturating when both
// inputs are INT32_MIN.
template <typename T>
inline T saturating_doubling_high_mul(const T&, const int32_t&);

template <>
inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32_t& b)
{
  return vqrdmulhq_n_s32(a, b);
}

template <>
inline int32_t saturating_doubling_high_mul(const int32_t& a, const int32_t& b)
{
  return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0);
}

// Divide by 2^exponent, rounding to nearest with ties rounded away from
// zero. The fixup term adjusts negative inputs so that the rounding shift
// (which rounds halves upward) matches this convention.
template <typename T>
inline T rounding_divide_by_exp2(const T& x, const int exponent);

template <>
inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int exponent)
{
  const int32x4_t shift = vdupq_n_s32(-exponent);
  const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
  const int32x4_t fixed = vqaddq_s32(x, fixup);
  return vrshlq_s32(fixed, shift);
}

template <>
inline int32x2_t rounding_divide_by_exp2(const int32x2_t& x, const int exponent)
{
  const int32x2_t shift = vdup_n_s32(-exponent);
  const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31);
  const int32x2_t fixed = vqadd_s32(x, fixup);
  return vrshl_s32(fixed, shift);
}

template <>
inline int32_t rounding_divide_by_exp2(const int32_t& x, const int exponent)
{
  const int32x2_t xs = vdup_n_s32(x);
  return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0);
}
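
// Fixed-point requantisation combines the two helpers above:
//
//   rounding_divide_by_exp2(
//     saturating_doubling_high_mul(acc, multiplier), shift)
//       ~= acc * (multiplier / 2^31) * 2^-shift
//
// Assuming QAsymm8RescaleParams::make_rescale_params follows the usual
// gemmlowp convention, a real rescale factor r in (0, 1) is encoded as
// r = m * 2^-shift with m in [0.5, 1) and multiplier = round(m * 2^31).
// For example, r = 0.2 gives shift = 2 and multiplier = 1717986918.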

namespace depthwise
{
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::QAsymm8DepthwiseConvolution(
  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
  const ActivationFunction activation,
  const QAsymm8Params& weight_quantisation,
  const QAsymm8Params& input_quantisation,
  const QAsymm8Params& output_quantisation,
  unsigned int padding_top,
  unsigned int padding_left,
  unsigned int padding_bottom,
  unsigned int padding_right
) : QAsymm8DepthwiseConvolution(
    n_batches, n_input_rows, n_input_cols, n_channels,
    activation, weight_quantisation, input_quantisation, output_quantisation,
    QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
    padding_top, padding_left, padding_bottom, padding_right
  )
{
}

template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::QAsymm8DepthwiseConvolution(
  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
  int n_output_rows, int n_output_cols,
  const ActivationFunction activation,
  const QAsymm8Params& weight_quantisation,
  const QAsymm8Params& input_quantisation,
  const QAsymm8Params& output_quantisation,
  unsigned int padding_top,
  unsigned int padding_left,
  unsigned int padding_bottom,
  unsigned int padding_right
) : QAsymm8DepthwiseConvolution(
    n_batches, n_input_rows, n_input_cols, n_channels,
    n_output_rows, n_output_cols,
    activation, weight_quantisation, input_quantisation, output_quantisation,
    QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
    padding_top, padding_left, padding_bottom, padding_right
  )
{
}

template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::QAsymm8DepthwiseConvolution(
  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
  const ActivationFunction activation,
  const QAsymm8Params& weight_quantisation,
  const QAsymm8Params& input_quantisation,
  const QAsymm8Params& output_quantisation,
  const QAsymm8RescaleParams& rescale_params,
  unsigned int padding_top,
  unsigned int padding_left,
  unsigned int padding_bottom,
  unsigned int padding_right
) : Base(
    n_batches, n_input_rows, n_input_cols, n_channels,
    get_activation_fn(activation, output_quantisation),
    padding_top, padding_left, padding_bottom, padding_right
  ),
  _weights_quant(weight_quantisation),
  _inputs_quant(input_quantisation),
  _output_quant(output_quantisation),
  rescale_parameters(rescale_params)
{
}

template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::QAsymm8DepthwiseConvolution(
  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
  int n_output_rows, int n_output_cols,
  const ActivationFunction activation,
  const QAsymm8Params& weight_quantisation,
  const QAsymm8Params& input_quantisation,
  const QAsymm8Params& output_quantisation,
  const QAsymm8RescaleParams& rescale_params,
  unsigned int padding_top,
  unsigned int padding_left,
  unsigned int padding_bottom,
  unsigned int padding_right
) : Base(
    n_batches, n_input_rows, n_input_cols, n_channels,
    n_output_rows, n_output_cols,
    get_activation_fn(activation, output_quantisation),
    padding_top, padding_left, padding_bottom, padding_right
  ),
  _weights_quant(weight_quantisation),
  _inputs_quant(input_quantisation),
  _output_quant(output_quantisation),
  rescale_parameters(rescale_params)
{
}

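// Illustrative instantiation (hypothetical values; the tile, kernel and
// stride sizes are template parameters, while the quantisation parameters
// are supplied at runtime):
//
//   using Conv3x3 = QAsymm8DepthwiseConvolution<
//     2, 2,  // 2x2 output tile
//     3, 3,  // 3x3 kernel
//     1, 1   // unit stride
//   >;
//   Conv3x3 conv(n_batches, n_input_rows, n_input_cols, n_channels,
//                ActivationFunction::ReLU6, wquant, iquant, oquant,
//                pad_top, pad_left, pad_bottom, pad_right);
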
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
ActivationFunction QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::get_activation_fn(
  const ActivationFunction activation,
  const QAsymm8Params& output_quant
)
{
  if (
    (activation == ActivationFunction::ReLU &&
     output_quant.quantize(0) == 0) ||
    (activation == ActivationFunction::ReLU6 &&
     output_quant.quantize(0) == 0 &&
     output_quant.dequantize(255) <= 6.0f)
  )
  {
    // If the range of values representable by the quantized outputs is
    // contained within the range that the activation function would produce,
    // then the activation function is redundant and can be skipped.
    return ActivationFunction::None;
  }
  else if (
    activation == ActivationFunction::ReLU6 &&
    output_quant.dequantize(255) <= 6.0f
  )
  {
    // If the largest value that can be represented by a quantized value is
    // no greater than the upper boundary, then the activation function can
    // be relaxed to a ReLU.
    return ActivationFunction::ReLU;
  }

  return activation;
}
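
// For example, with an output offset of 0 and a scale of 6/255,
// quantize(0) == 0 and dequantize(255) == 6.0f, so a requested ReLU6
// collapses to None: every representable output already lies in [0, 6].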

template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
uint8_t QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::_input_padding_value(void) const
{
  // Use the input zero point as the padding value so that padded elements
  // dequantise to zero.
  return _inputs_quant.offset;
}

template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
void QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::_pack_params(
  void * const buffer,
  const void * const weights,
  const unsigned int weight_row_stride,
  const unsigned int weight_col_stride,
  const void * const biases
) const
{
  const uint8_t *wptr = static_cast<const uint8_t *>(weights);
  const int32_t *bptr = static_cast<const int32_t *>(biases);
  uint8_t *outptr = static_cast<uint8_t *>(buffer);

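  // For each group of `veclen` channels, the packed buffer holds `veclen`
  // int32 bias values followed by the weights for those channels, stored
  // kernel-position-major and channel-minor:
  //
  //   [ bias[0..veclen) | w(0,0)[0..veclen) | ... | w(R-1,C-1)[0..veclen) ]
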
  // We set the vector length to use quad (128-bit) registers on AArch64 and
  // only double (64-bit) registers on AArch32. NOTE For SVE set this to the
  // actual vector length.
#if defined(__aarch64__)
  unsigned int veclen = 16;
#elif defined(__arm__)
  unsigned int veclen = 8;
#endif

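  // A quantised multiply-accumulate expands as
  //
  //   sum_ij (w_ij - z_w)(x_ij - z_x)
  //     = sum_ij w_ij x_ij - z_w sum_ij x_ij - z_x sum_ij w_ij + K z_w z_x
  //
  // where K = KernelRows * KernelCols, and z_w, z_x are the weight and input
  // offsets. The two input-independent terms (the "rank 0" and "rank 1"
  // offsets below) are folded into the packed bias; the z_w * sum(x) term is
  // subtracted at run time in `execute_tile`.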
  // Compute the rank 0 offset arising from the quantisation parameters.
  const int32_t rank0_offset = (KernelRows * KernelCols *
                                static_cast<int32_t>(_weights_quant.offset) *
                                static_cast<int32_t>(_inputs_quant.offset));

  // While there are channels left to process, pack a vector length of them
  // at a time, reducing the vector length used as the number of remaining
  // channels decreases.
  for (
    unsigned int n_channels = this->n_channels(); n_channels;
    n_channels -= veclen,
    outptr += veclen*(sizeof(int32_t) + this->kernel_rows*this->kernel_cols)
  )
  {
    // NOTE Ignore this section if using SVE: the vector length remains the
    // same and we simply don't fill a full register for the tail.
    while (n_channels < veclen)
    {
      // Reduce the vector length to either 8 or 1 (scalar)
      // TODO Support more vector lengths in `execute_tile`.
      veclen = (veclen == 16) ? 8 : 1;
    }

    // Get pointers to bias and weight portions of the output structure.
    int32_t *out_bptr = reinterpret_cast<int32_t *>(outptr);
    uint8_t *out_wptr = outptr + veclen*sizeof(int32_t);

    // Copy a vector length of elements
    for (unsigned int n = 0; n < veclen && n < n_channels; n++)
    {
      int32_t bias = (bptr != nullptr) ? *(bptr++) : 0;
      uint32_t weight_sum = 0;

      for (unsigned int i = 0; i < KernelRows; i++)
      {
        uint8_t *row_outptr = out_wptr + i*KernelCols*veclen;
        for (unsigned int j = 0; j < KernelCols; j++)
        {
          uint8_t w = *(wptr + i*weight_row_stride + j*weight_col_stride);
          row_outptr[j*veclen + n] = w;
          weight_sum += static_cast<uint32_t>(w);
        }
      }
      wptr++;

      // Include in the bias the contributions from the quantisation offsets
      int32_t rank1_offset = static_cast<int32_t>(
        static_cast<uint32_t>(_inputs_quant.offset) * weight_sum
      );
      out_bptr[n] = bias + rank0_offset - rank1_offset;
    }
  }
}
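
// As a concrete example: a 3x3 kernel packed with veclen == 16 stores, per
// group of 16 channels, 16*sizeof(int32_t) == 64 bytes of bias followed by
// 9*16 == 144 bytes of weights, i.e. 208 bytes per group.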

template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
template <ActivationFunction Activation>
void QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::execute_tile(
  int n_channels,
  const void* packed_params,
  const uint8_t* inptr,
  const unsigned int in_row_stride,
  const unsigned int in_col_stride,
  uint8_t* outptr,
  const unsigned int out_row_stride,
  const unsigned int out_col_stride
)
{
  // Activation parameters (unused if Activation is None)
  const uint8_t aqmin = _output_quant.offset;
  const uint8_t aqmax = (Activation == ActivationFunction::ReLU6) ?
    std::min<uint8_t>(255u, _output_quant.quantize(6.0f)) : 255u;

  // Byte type pointer to weights and biases
  const uint8_t *wbptr = static_cast<const uint8_t *>(packed_params);

#if defined(__aarch64__)  // Under AArch64 only use quad registers
  for (; n_channels >= 16; n_channels -= 16)
  {
    // Load biases
    const int32x4_t biases[4] = {
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 8),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 12)
    };
    wbptr += 16*sizeof(int32_t);

    // Load weights
    uint8x16_t weights[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        weights[i][j] = vld1q_u8(wbptr);
        wbptr += 16;
      }
    }

    // Load the input activations
    uint8x16_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
    for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
      {
        inputs[i][j] = vld1q_u8(inptr + i*in_row_stride + j*in_col_stride);
      }
    }
    inptr += 16;

    // Perform the convolution
    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
    {
      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
      {
        // Two sets of operations are required: we perform the
        // multiply-accumulates for the convolution proper, but must also sum
        // the tile elements to account for the _weight_ offset.
        uint32x4_t accs[4];
        for (unsigned int i = 0; i < 4; i++)
        {
          accs[i] = reinterpret_cast<uint32x4_t>(biases[i]);
        }

        for (unsigned int wi = 0; wi < KernelRows; wi++)
        {
          for (unsigned int wj = 0; wj < KernelCols; wj++)
          {
            // Get relevant weight and activation pixel
            const uint8x16_t w = weights[wi][wj];
            const uint8x16_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];

            // Perform multiplication and accumulation
            const uint16x8_t muls[2] = {
              vmull_u8(vget_low_u8(w), vget_low_u8(x)),
              vmull_u8(vget_high_u8(w), vget_high_u8(x))
            };

            const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset);
            const uint16x8_t sum_elems[2] = {
              vmull_u8(vget_low_u8(x), woffset),
              vmull_u8(vget_high_u8(x), woffset)
            };

            const uint32x4_t tmps[4] = {
              vsubl_u16(vget_low_u16(muls[0]), vget_low_u16(sum_elems[0])),
              vsubl_u16(vget_high_u16(muls[0]), vget_high_u16(sum_elems[0])),
              vsubl_u16(vget_low_u16(muls[1]), vget_low_u16(sum_elems[1])),
              vsubl_u16(vget_high_u16(muls[1]), vget_high_u16(sum_elems[1])),
            };
            for (unsigned int i = 0; i < 4; i++)
            {
              accs[i] = vaddq_u32(accs[i], tmps[i]);
            }
          }
        }

        // Rescale the accumulator and add in the new offset.
        uint32x4_t final_accs[4];
        for (unsigned int i = 0; i < 4; i++)
        {
#ifdef FIXED_POINT_REQUANTISATION
          const int32x4_t y = rounding_divide_by_exp2(
            saturating_doubling_high_mul(
              reinterpret_cast<int32x4_t>(accs[i]), rescale_parameters.multiplier
            ),
            rescale_parameters.shift
          );
          const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(_output_quant.offset));
          final_accs[i] = reinterpret_cast<uint32x4_t>(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0)));
#else  // floating point requantisation
          float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast<int32x4_t>(accs[i]));
          fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale));
          fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast<float>(_output_quant.offset)));
          fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f));
          final_accs[i] = vcvtq_u32_f32(fp_acc);
#endif
        }

        uint8x16_t output = vcombine_u8(
          vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1]))),
          vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[2]), vqmovn_u32(final_accs[3])))
        );

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          output = vmaxq_u8(output, vdupq_n_u8(aqmin));
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          output = vminq_u8(output, vdupq_n_u8(aqmax));
        }

        vst1q_u8(outptr + oi*out_row_stride + oj*out_col_stride, output);
      }
    }
    outptr += 16;
  }
#endif  // defined(__aarch64__)
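  // The eight-channel loop below acts as the main loop on AArch32 and as the
  // first level of tail handling on AArch64.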
  for (; n_channels >= 8; n_channels -= 8)
  {
    const int32x4_t biases[2] = {
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
    };
    wbptr += 8*sizeof(int32_t);

    uint8x8_t weights[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        weights[i][j] = vld1_u8(wbptr);
        wbptr += 8;
      }
    }

    uint8x8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
    for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
      {
        inputs[i][j] = vld1_u8(inptr + i*in_row_stride + j*in_col_stride);
      }
    }
    inptr += 8;

    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
    {
      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
      {
        uint32x4_t accs[2];
        for (unsigned int i = 0; i < 2; i++)
        {
          accs[i] = reinterpret_cast<uint32x4_t>(biases[i]);
        }

        for (unsigned int wi = 0; wi < KernelRows; wi++)
        {
          for (unsigned int wj = 0; wj < KernelCols; wj++)
          {
            const uint8x8_t w = weights[wi][wj];
            const uint8x8_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];

            const uint16x8_t muls = vmull_u8(w, x);
            const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset);
            const uint16x8_t sum_elems = vmull_u8(x, woffset);

            const uint32x4_t tmps[2] = {
              vsubl_u16(vget_low_u16(muls), vget_low_u16(sum_elems)),
              vsubl_u16(vget_high_u16(muls), vget_high_u16(sum_elems)),
            };
            for (unsigned int i = 0; i < 2; i++)
            {
              accs[i] = vaddq_u32(accs[i], tmps[i]);
            }
          }
        }

        uint32x4_t final_accs[2];
        for (unsigned int i = 0; i < 2; i++)
        {
#ifdef FIXED_POINT_REQUANTISATION
          const int32x4_t y = rounding_divide_by_exp2(
            saturating_doubling_high_mul(
              reinterpret_cast<int32x4_t>(accs[i]), rescale_parameters.multiplier
            ),
            rescale_parameters.shift
          );
          const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(_output_quant.offset));
          final_accs[i] = reinterpret_cast<uint32x4_t>(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0)));
#else  // floating point requantisation
          float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast<int32x4_t>(accs[i]));
          fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale));
          fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast<float>(_output_quant.offset)));
          fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f));
          final_accs[i] = vcvtq_u32_f32(fp_acc);
#endif
        }

        uint8x8_t output = vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1])));

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          output = vmax_u8(output, vdup_n_u8(aqmin));
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          output = vmin_u8(output, vdup_n_u8(aqmax));
        }

        vst1_u8(outptr + oi*out_row_stride + oj*out_col_stride, output);
      }
    }
    outptr += 8;
  }
  for (; n_channels; n_channels--)
  {
    // Load bias
    const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
    wbptr += sizeof(int32_t);

    // Load weights
    uint8_t weights[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        weights[i][j] = *(wbptr++);
      }
    }

    // Load the input activations
    uint8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
    for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
      {
        inputs[i][j] = *(inptr + i*in_row_stride + j*in_col_stride);
      }
    }
    inptr++;

    // Perform the convolution
    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
    {
      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
      {
        int32_t acc = bias;
        uint32_t element_sum = 0;

        for (unsigned int wi = 0; wi < KernelRows; wi++)
        {
          for (unsigned int wj = 0; wj < KernelCols; wj++)
          {
            const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
            acc += static_cast<int32_t>(static_cast<uint32_t>(w) * static_cast<uint32_t>(x));
            element_sum += static_cast<uint32_t>(x);
          }
        }

        acc -= static_cast<int32_t>(element_sum) * static_cast<int32_t>(_weights_quant.offset);

        // Requantize
#ifdef FIXED_POINT_REQUANTISATION
        acc = rounding_divide_by_exp2(
          saturating_doubling_high_mul(acc, rescale_parameters.multiplier),
          rescale_parameters.shift
        );
        acc += _output_quant.offset;
        uint8_t output = clamp_to_limits<uint8_t>::clamp_and_cast<int32_t>(acc);
#else  // floating point requantisation
        float fp_acc = static_cast<float>(acc);
        fp_acc *= rescale_parameters.rescale;
        fp_acc += static_cast<float>(_output_quant.offset);
        fp_acc = std::max<float>(fp_acc, 0.0f);
        uint8_t output = static_cast<uint8_t>(std::min<int32_t>(static_cast<int32_t>(fp_acc), 255));
#endif

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          output = std::max(output, aqmin);
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          output = std::min(output, aqmax);
        }

        *(outptr + oi*out_row_stride + oj*out_col_stride) = output;
      }
    }
    outptr++;
  }
}

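// The variant below performs the same computation as `execute_tile` above,
// but addresses the input and output through precomputed per-pixel pointer
// tables rather than dense row and column strides.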
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
template <ActivationFunction Activation>
void QAsymm8DepthwiseConvolution<
  OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
>::execute_tile(
  int n_channels,
  const void* packed_params,
  const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
  uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
)
{
  // Activation parameters (unused if Activation is None)
  const uint8_t aqmin = _output_quant.offset;
  const uint8_t aqmax = (Activation == ActivationFunction::ReLU6) ?
    std::min<uint8_t>(255u, _output_quant.quantize(6.0f)) : 255u;

  // Byte type pointer to weights and biases
  const uint8_t *wbptr = static_cast<const uint8_t *>(packed_params);

  // Offset into input/output tensors
  int n = 0;

#if defined(__aarch64__)  // Under AArch64 only use quad registers
  for (; n_channels >= 16; n_channels -= 16, n += 16)
  {
    // Load biases
    const int32x4_t biases[4] = {
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 8),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 12)
    };
    wbptr += 16*sizeof(int32_t);

    // Load weights
    uint8x16_t weights[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        weights[i][j] = vld1q_u8(wbptr);
        wbptr += 16;
      }
    }

    // Load the input activations
    uint8x16_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
    for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
      {
        inputs[i][j] = vld1q_u8(inptrs[i][j] + n);
      }
    }

    // Perform the convolution
    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
    {
      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
      {
        // Two sets of operations are required: we perform the
        // multiply-accumulates for the convolution proper, but must also sum
        // the tile elements to account for the _weight_ offset.
        uint32x4_t accs[4];
        for (unsigned int i = 0; i < 4; i++)
        {
          accs[i] = reinterpret_cast<uint32x4_t>(biases[i]);
        }

        for (unsigned int wi = 0; wi < KernelRows; wi++)
        {
          for (unsigned int wj = 0; wj < KernelCols; wj++)
          {
            // Get relevant weight and activation pixel
            const uint8x16_t w = weights[wi][wj];
            const uint8x16_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];

            // Perform multiplication and accumulation
            const uint16x8_t muls[2] = {
              vmull_u8(vget_low_u8(w), vget_low_u8(x)),
              vmull_u8(vget_high_u8(w), vget_high_u8(x))
            };

            const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset);
            const uint16x8_t sum_elems[2] = {
              vmull_u8(vget_low_u8(x), woffset),
              vmull_u8(vget_high_u8(x), woffset)
            };

            const uint32x4_t tmps[4] = {
              vsubl_u16(vget_low_u16(muls[0]), vget_low_u16(sum_elems[0])),
              vsubl_u16(vget_high_u16(muls[0]), vget_high_u16(sum_elems[0])),
              vsubl_u16(vget_low_u16(muls[1]), vget_low_u16(sum_elems[1])),
              vsubl_u16(vget_high_u16(muls[1]), vget_high_u16(sum_elems[1])),
            };
            for (unsigned int i = 0; i < 4; i++)
            {
              accs[i] = vaddq_u32(accs[i], tmps[i]);
            }
          }
        }

        // Rescale the accumulator and add in the new offset.
        uint32x4_t final_accs[4];
        for (unsigned int i = 0; i < 4; i++)
        {
#ifdef FIXED_POINT_REQUANTISATION
          const int32x4_t y = rounding_divide_by_exp2(
            saturating_doubling_high_mul(
              reinterpret_cast<int32x4_t>(accs[i]), rescale_parameters.multiplier
            ),
            rescale_parameters.shift
          );
          const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(_output_quant.offset));
          final_accs[i] = reinterpret_cast<uint32x4_t>(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0)));
#else  // floating point requantisation
          float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast<int32x4_t>(accs[i]));
          fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale));
          fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast<float>(_output_quant.offset)));
          fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f));
          final_accs[i] = vcvtq_u32_f32(fp_acc);
#endif
        }

        uint8x16_t output = vcombine_u8(
          vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1]))),
          vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[2]), vqmovn_u32(final_accs[3])))
        );

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          output = vmaxq_u8(output, vdupq_n_u8(aqmin));
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          output = vminq_u8(output, vdupq_n_u8(aqmax));
        }

        vst1q_u8(outptrs[oi][oj] + n, output);
      }
    }
  }
#endif  // defined(__aarch64__)
  for (; n_channels >= 8; n_channels -= 8, n += 8)
  {
    const int32x4_t biases[2] = {
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
      vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
    };
    wbptr += 8*sizeof(int32_t);

    uint8x8_t weights[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        weights[i][j] = vld1_u8(wbptr);
        wbptr += 8;
      }
    }

    uint8x8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
    for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
      {
        inputs[i][j] = vld1_u8(inptrs[i][j] + n);
      }
    }

    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
    {
      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
      {
        uint32x4_t accs[2];
        for (unsigned int i = 0; i < 2; i++)
        {
          accs[i] = reinterpret_cast<uint32x4_t>(biases[i]);
        }

        for (unsigned int wi = 0; wi < KernelRows; wi++)
        {
          for (unsigned int wj = 0; wj < KernelCols; wj++)
          {
            const uint8x8_t w = weights[wi][wj];
            const uint8x8_t x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];

            const uint16x8_t muls = vmull_u8(w, x);
            const uint8x8_t woffset = vdup_n_u8(_weights_quant.offset);
            const uint16x8_t sum_elems = vmull_u8(x, woffset);

            const uint32x4_t tmps[2] = {
              vsubl_u16(vget_low_u16(muls), vget_low_u16(sum_elems)),
              vsubl_u16(vget_high_u16(muls), vget_high_u16(sum_elems)),
            };
            for (unsigned int i = 0; i < 2; i++)
            {
              accs[i] = vaddq_u32(accs[i], tmps[i]);
            }
          }
        }

        uint32x4_t final_accs[2];
        for (unsigned int i = 0; i < 2; i++)
        {
#ifdef FIXED_POINT_REQUANTISATION
          const int32x4_t y = rounding_divide_by_exp2(
            saturating_doubling_high_mul(
              reinterpret_cast<int32x4_t>(accs[i]), rescale_parameters.multiplier
            ),
            rescale_parameters.shift
          );
          const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(_output_quant.offset));
          final_accs[i] = reinterpret_cast<uint32x4_t>(vmaxq_s32(vaddq_s32(y, offset), vdupq_n_s32(0)));
#else  // floating point requantisation
          float32x4_t fp_acc = vcvtq_f32_s32(reinterpret_cast<int32x4_t>(accs[i]));
          fp_acc = vmulq_f32(fp_acc, vdupq_n_f32(rescale_parameters.rescale));
          fp_acc = vaddq_f32(fp_acc, vdupq_n_f32(static_cast<float>(_output_quant.offset)));
          fp_acc = vmaxq_f32(fp_acc, vdupq_n_f32(0.0f));
          final_accs[i] = vcvtq_u32_f32(fp_acc);
#endif
        }

        uint8x8_t output = vqmovn_u16(vcombine_u16(vqmovn_u32(final_accs[0]), vqmovn_u32(final_accs[1])));

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          output = vmax_u8(output, vdup_n_u8(aqmin));
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          output = vmin_u8(output, vdup_n_u8(aqmax));
        }

        vst1_u8(outptrs[oi][oj] + n, output);
      }
    }
  }
  for (; n_channels; n_channels--, n++)
  {
    // Load bias
    const int32_t bias = *reinterpret_cast<const int32_t *>(wbptr);
    wbptr += sizeof(int32_t);

    // Load weights
    uint8_t weights[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        weights[i][j] = *(wbptr++);
      }
    }

    // Load the input activations
    uint8_t inputs[Base::inner_tile_rows][Base::inner_tile_cols];
    for (unsigned int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (unsigned int j = 0; j < Base::inner_tile_cols; j++)
      {
        inputs[i][j] = *(inptrs[i][j] + n);
      }
    }

    // Perform the convolution
    for (unsigned int oi = 0; oi < OutputTileRows; oi++)
    {
      for (unsigned int oj = 0; oj < OutputTileCols; oj++)
      {
        int32_t acc = bias;
        uint32_t element_sum = 0;

        for (unsigned int wi = 0; wi < KernelRows; wi++)
        {
          for (unsigned int wj = 0; wj < KernelCols; wj++)
          {
            const auto w = weights[wi][wj], x = inputs[oi*StrideRows + wi][oj*StrideCols + wj];
            acc += static_cast<int32_t>(static_cast<uint32_t>(w) * static_cast<uint32_t>(x));
            element_sum += static_cast<uint32_t>(x);
          }
        }

        acc -= static_cast<int32_t>(element_sum) * static_cast<int32_t>(_weights_quant.offset);

        // Requantize
#ifdef FIXED_POINT_REQUANTISATION
        acc = rounding_divide_by_exp2(
          saturating_doubling_high_mul(acc, rescale_parameters.multiplier),
          rescale_parameters.shift
        );
        acc += _output_quant.offset;
        uint8_t output = clamp_to_limits<uint8_t>::clamp_and_cast<int32_t>(acc);
#else  // floating point requantisation
        float fp_acc = static_cast<float>(acc);
        fp_acc *= rescale_parameters.rescale;
        fp_acc += static_cast<float>(_output_quant.offset);
        fp_acc = std::max<float>(fp_acc, 0.0f);
        uint8_t output = static_cast<uint8_t>(std::min<int32_t>(static_cast<int32_t>(fp_acc), 255));
#endif

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          output = std::max(output, aqmin);
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          output = std::min(output, aqmax);
        }

        *(outptrs[oi][oj] + n) = output;
      }
    }
  }
}

}  // namespace depthwise