Blame - src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp - ml/ComputeLibrary

blob: 5546d37e5981427dc1aa868baa73f9166638f146 [file] [log] [blame]

Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	1	/*
				2	* Copyright (c) 2019 ARM Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24
				25	/*
				26	* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
				27	*
				28	* NOTE: Header to be included by implementation files only.
				29	*
				30	* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
				31	*/
				32
				33	#include <limits>
				34
				35	#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
				36	#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
				37	#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp"
				38
				39	#pragma once
				40
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	41	using namespace neon_convolution_kernels;
				42	using namespace qasymm8;
				43
				44	template <typename T>
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	45	inline T saturating_doubling_high_mul(const T&, const int32_t&);
				46
				47	template <>
				48	inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32_t& b)
				49	{
				50	return vqrdmulhq_n_s32(a, b);
				51	}
				52
				53	template <>
				54	inline int32_t saturating_doubling_high_mul(const int32_t& a, const int32_t& b)
				55	{
				56	return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0);
				57	}
				58
				59	template <typename T>
				60	inline T rounding_divide_by_exp2(const T& x, const int exponent);
				61
				62	template <>
				63	inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int exponent)
				64	{
				65	const int32x4_t shift = vdupq_n_s32(-exponent);
				66	const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
				67	const int32x4_t fixed = vqaddq_s32(x, fixup);
				68	return vrshlq_s32(fixed, shift);
				69	}
				70
				71	template <>
				72	inline int32x2_t rounding_divide_by_exp2(const int32x2_t& x, const int exponent)
				73	{
				74	const int32x2_t shift = vdup_n_s32(-exponent);
				75	const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31);
				76	const int32x2_t fixed = vqadd_s32(x, fixup);
				77	return vrshl_s32(fixed, shift);
				78	}
				79
				80	template <>
				81	inline int32_t rounding_divide_by_exp2(const int32_t& x, const int exponent)
				82	{
				83	const int32x2_t xs = vdup_n_s32(x);
				84	return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0);
				85	}
				86
				87	namespace depthwise
				88	{
				89	template <
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	90	unsigned int OutputTileRows, unsigned int OutputTileCols,
				91	unsigned int KernelRows, unsigned int KernelCols,
				92	unsigned int StrideRows, unsigned int StrideCols
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	93	>
				94	QAsymm8DepthwiseConvolution<
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	95	OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	96	>::QAsymm8DepthwiseConvolution(
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	97	int n_batches, int n_input_rows, int n_input_cols, int n_channels,
				98	const ActivationFunction activation,
				99	const QAsymm8Params& weight_quantisation,
				100	const QAsymm8Params& input_quantisation,
				101	const QAsymm8Params& output_quantisation,
				102	unsigned int padding_top,
				103	unsigned int padding_left,
				104	unsigned int padding_bottom,
				105	unsigned int padding_right
				106	) : QAsymm8DepthwiseConvolution(
				107	n_batches, n_input_rows, n_input_cols, n_channels,
				108	activation, weight_quantisation, input_quantisation, output_quantisation,
				109	QAsymm8RescaleParams::make_rescale_params(weight_quantisation, input_quantisation, output_quantisation),
				110	padding_top, padding_left, padding_bottom, padding_right
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	111	)
				112	{
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	113	}
				114
				115	template <
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	116	unsigned int OutputTileRows, unsigned int OutputTileCols,
				117	unsigned int KernelRows, unsigned int KernelCols,
				118	unsigned int StrideRows, unsigned int StrideCols
				119	>
				120	QAsymm8DepthwiseConvolution<
				121	OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
				122	>::QAsymm8DepthwiseConvolution(
				123	int n_batches, int n_input_rows, int n_input_cols, int n_channels,
				124	const ActivationFunction activation,
				125	const QAsymm8Params& weight_quantisation,
				126	const QAsymm8Params& input_quantisation,
				127	const QAsymm8Params& output_quantisation,
				128	const QAsymm8RescaleParams& rescale_params,
				129	unsigned int padding_top,
				130	unsigned int padding_left,
				131	unsigned int padding_bottom,
				132	unsigned int padding_right
				133	) : Base(
				134	n_batches, n_input_rows, n_input_cols, n_channels, activation,
				135	padding_top, padding_left, padding_bottom, padding_right
				136	),
				137	_weights_quant(weight_quantisation),
				138	_inputs_quant(input_quantisation),
				139	_output_quant(output_quantisation),
				140	rescale_parameters(rescale_params)
				141	{
				142	}
				143
				144	template <
				145	unsigned int OutputTileRows, unsigned int OutputTileCols,
				146	unsigned int KernelRows, unsigned int KernelCols,
				147	unsigned int StrideRows, unsigned int StrideCols
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	148	>
				149	uint8_t QAsymm8DepthwiseConvolution<
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	150	OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	151	>::_input_padding_value(void) const
				152	{
				153	return _inputs_quant.offset;
				154	}
				155
				156	template <
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	157	unsigned int OutputTileRows, unsigned int OutputTileCols,
				158	unsigned int KernelRows, unsigned int KernelCols,
				159	unsigned int StrideRows, unsigned int StrideCols
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	160	>
				161	void QAsymm8DepthwiseConvolution<
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	162	OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	163	>::_pack_params(
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	164	void * const buffer,
				165	const void * const weights,
				166	const unsigned int weight_row_stride,
				167	const unsigned int weight_col_stride,
				168	const void * const biases
				169	) const
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	170	{
				171	const uint8_t wptr = static_cast<const uint8_t >(weights);
				172	const int32_t bptr = static_cast<const int32_t >(biases);
				173	uint8_t outptr = static_cast<uint8_t >(buffer);
				174
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	175	// We set the vector length to use doubles on both Aarch64 and Aarch32. NOTE
				176	// For SVE set this to half the vector length.
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	177	unsigned int veclen = 8;
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	178
				179	// While there are channels left to process, pack a vector length of them at
				180	// a time and reduce the size of vector used as the size of the tensor
				181	// decreases.
				182	for (
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	183	unsigned int n_channels = this->n_channels(); n_channels;
				184	n_channels -= veclen,
				185	outptr += veclen(sizeof(int32_t) + this->kernel_rowsthis->kernel_cols)
				186	)
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	187	{
				188	// NOTE Ignore this section if using SVE, the vector length remains the
				189	// same and we just don't fill a full register for the tail.
				190	while (n_channels < veclen)
				191	{
				192	// Reduce the vector length to either 8 or 1 (scalar)
				193	// TODO Support more vector lengths in `execute_tile`.
				194	veclen = (veclen == 16) ? 8 : 1;
				195	}
				196
				197	// Get pointers to bias and weight portions of the output structure.
				198	int32_t out_bptr = reinterpret_cast<int32_t >(outptr);
				199	uint8_t out_wptr = outptr + veclensizeof(int32_t);
				200
				201	// Copy a vector length of elements
				202	for (unsigned int n = 0; n < veclen && n < n_channels; n++)
				203	{
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	204	const int32_t bias = (bptr != nullptr) ? *(bptr++) : 0;
				205	out_bptr[n] = bias;
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	206
				207	for (unsigned int i = 0; i < KernelRows; i++)
				208	{
				209	uint8_t row_outptr = out_wptr + iKernelCols*veclen;
				210	for (unsigned int j = 0; j < KernelCols; j++)
				211	{
				212	uint8_t w = (wptr + iweight_row_stride + j*weight_col_stride);
				213	row_outptr[j*veclen + n] = w;
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	214	}
				215	}
				216	wptr++;
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	217	}
				218	}
				219	}
				220
				221	template <
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	222	unsigned int OutputTileRows, unsigned int OutputTileCols,
				223	unsigned int KernelRows, unsigned int KernelCols,
				224	unsigned int StrideRows, unsigned int StrideCols,
				225	typename FInput, typename FOutput
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	226	>
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	227	static inline void tilefn(
				228	int n_channels,
				229	const void* packed_params,
				230	FInput &get_input_ptr,
				231	FOutput &get_output_ptr,
				232	const int32_t clamp_max,
				233	const int32_t clamp_min,
				234	const uint8_t input_offset,
				235	const uint8_t weight_offset,
				236	const uint8_t output_offset,
				237	const int32_t requant_multiplier,
				238	const int32_t requant_shift
				239	)
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	240	{
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	241	constexpr int InnerTileRows = StrideRows * (OutputTileRows - 1) + KernelRows;
				242	constexpr int InnerTileCols = StrideCols * (OutputTileCols - 1) + KernelCols;
				243
				244	// Offset into channels
				245	int channel = 0;
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	246
				247	// Byte type pointer to weights and biases
				248	const uint8_t wbptr = static_cast<const uint8_t >(packed_params);
				249
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	250	for (; n_channels >= 8; n_channels -= 8, channel += 8)
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	251	{
				252	const int32x4_t biases[2] = {
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	253	vld1q_s32(reinterpret_cast<const int32_t *>(wbptr)),
				254	vld1q_s32(reinterpret_cast<const int32_t *>(wbptr) + 4),
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	255	};
				256	wbptr += 8*sizeof(int32_t);
				257
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	258	int16x8_t weights[KernelRows][KernelCols];
				259	const uint8x8_t woffset = vdup_n_u8(weight_offset);
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	260	for (unsigned int i = 0; i < KernelRows; i++)
				261	{
				262	for (unsigned int j = 0; j < KernelCols; j++)
				263	{
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	264	const uint8x8_t w = vld1_u8(wbptr);
				265	weights[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(w, woffset));
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	266	wbptr += 8;
				267	}
				268	}
				269
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	270	int16x8_t inputs[InnerTileRows][InnerTileCols];
				271	const uint8x8_t ioffset = vdup_n_u8(input_offset);
				272	for (unsigned int i = 0; i < InnerTileRows; i++)
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	273	{
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	274	for (unsigned int j = 0; j < InnerTileCols; j++)
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	275	{
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	276	const auto x = vld1_u8(get_input_ptr(i, j, channel));
				277	inputs[i][j] = reinterpret_cast<int16x8_t>(vsubl_u8(x, ioffset));
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	278	}
				279	}
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	280
				281	for (unsigned int oi = 0; oi < OutputTileRows; oi++)
				282	{
				283	for (unsigned int oj = 0; oj < OutputTileCols; oj++)
				284	{
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	285	int32x4_t accs[2];
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	286	for (unsigned int i = 0; i < 2; i++)
				287	{
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	288	accs[i] = biases[i];
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	289	}
				290
				291	for (unsigned int wi = 0; wi < KernelRows; wi++)
				292	{
				293	for (unsigned int wj = 0; wj < KernelCols; wj++)
				294	{
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	295	const auto w = weights[wi][wj];
				296	const auto x = inputs[oi * StrideRows + wi][oj * StrideCols + wj];
				297	accs[0] = vmlal_s16(accs[0], vget_low_s16(w), vget_low_s16(x));
				298	accs[1] = vmlal_s16(accs[1], vget_high_s16(w), vget_high_s16(x));
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	299	}
				300	}
				301
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	302	int32x4_t final_accs[2];
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	303	for (unsigned int i = 0; i < 2; i++)
				304	{
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	305	const int32x4_t y = rounding_divide_by_exp2(
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	306	saturating_doubling_high_mul(accs[i], requant_multiplier),
				307	requant_shift);
				308	const int32x4_t offset = reinterpret_cast<int32x4_t>(vdupq_n_u32(output_offset));
				309	final_accs[i] = vaddq_s32(y, offset);
				310	final_accs[i] = vmaxq_s32(final_accs[i], vdupq_n_s32(clamp_min));
				311	final_accs[i] = vminq_s32(final_accs[i], vdupq_n_s32(clamp_max));
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	312	}
				313
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	314	const auto elems_s16 = vuzpq_s16(vreinterpretq_s16_s32(final_accs[0]),
				315	vreinterpretq_s16_s32(final_accs[1]));
				316	const int8x16_t elems = vreinterpretq_s8_s16(elems_s16.val[0]);
				317	const uint8x8_t output =
				318	vget_low_u8(vreinterpretq_u8_s8(vuzpq_s8(elems, elems).val[0]));
				319	vst1_u8(get_output_ptr(oi, oj, channel), output);
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	320	}
				321	}
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	322	}
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	323	for (; n_channels; n_channels--, channel++)
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	324	{
				325	// Load bias
				326	const int32_t bias = reinterpret_cast<const int32_t >(wbptr);
				327	wbptr += sizeof(int32_t);
				328
				329	// Load weights
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	330	int16_t weights[KernelRows][KernelCols];
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	331	for (unsigned int i = 0; i < KernelRows; i++)
				332	{
				333	for (unsigned int j = 0; j < KernelCols; j++)
				334	{
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	335	weights[i][j] = *(wbptr++) - weight_offset;
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	336	}
				337	}
				338
				339	// Load the input activations
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	340	int16_t inputs[InnerTileRows][InnerTileCols];
				341	for (unsigned int i = 0; i < InnerTileRows; i++)
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	342	{
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	343	for (unsigned int j = 0; j < InnerTileCols; j++)
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	344	{
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	345	inputs[i][j] = *(get_input_ptr(i, j, channel)) - input_offset;
Georgios Pinitas	a4bba9c	2019-04-02 15:27:52 +0100	[diff] [blame]	346	}
				347	}
				348
				349	// Perform the convolution
				350	for (unsigned int oi = 0; oi < OutputTileRows; oi++)
				351	{
				352	for (unsigned int oj = 0; oj < OutputTileCols; oj++)
				353	{
				354	int32_t acc = bias;
Georgios Pinitas	a4bba9c	2019-04-02 15:27:52 +0100	[diff] [blame]	355
				356	for (unsigned int wi = 0; wi < KernelRows; wi++)
				357	{
				358	for (unsigned int wj = 0; wj < KernelCols; wj++)
				359	{
				360	const auto w = weights[wi][wj], x = inputs[oiStrideRows + wi][ojStrideCols + wj];
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	361	acc += w * x;
Georgios Pinitas	a4bba9c	2019-04-02 15:27:52 +0100	[diff] [blame]	362	}
				363	}
				364
Georgios Pinitas	a4bba9c	2019-04-02 15:27:52 +0100	[diff] [blame]	365	// Requantize
Georgios Pinitas	a4bba9c	2019-04-02 15:27:52 +0100	[diff] [blame]	366	acc = rounding_divide_by_exp2(
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	367	saturating_doubling_high_mul(acc, requant_multiplier),
				368	requant_shift);
				369	acc += output_offset;
				370	acc = std::max(acc, clamp_min);
				371	acc = std::min(acc, clamp_max);
				372	uint8_t output = static_cast<uint8_t>(acc);
				373	*(get_output_ptr(oi, oj, channel)) = output;
Georgios Pinitas	a4bba9c	2019-04-02 15:27:52 +0100	[diff] [blame]	374	}
				375	}
				376	}
				377	}
				378
Georgios Pinitas	a851bba	2019-04-12 13:55:39 +0100	[diff] [blame]	379	template <
				380	unsigned int OutputTileRows, unsigned int OutputTileCols,
				381	unsigned int KernelRows, unsigned int KernelCols,
				382	unsigned int StrideRows, unsigned int StrideCols,
				383	typename FInput, typename FOutput
				384	>
				385	static inline void execute_tilefn(
				386	int n_channels,
				387	const void* packed_params,
				388	const nck::ActivationFunction actfn,
				389	FInput &get_input_ptr,
				390	FOutput &get_output_ptr,
				391	const QAsymm8Params &input_quant,
				392	const QAsymm8Params &weight_quant,
				393	const QAsymm8Params &output_quant,
				394	const QAsymm8RescaleParams &requant
				395	) {
				396	// Compute min/max clamp values
				397	int32_t clamp_min = std::numeric_limits<uint8_t>::min();
				398	int32_t clamp_max = std::numeric_limits<uint8_t>::max();
				399
				400	if (actfn == nck::ActivationFunction::ReLU \|\|
				401	actfn == nck::ActivationFunction::ReLU6) {
				402	const int32_t bottom_rail = output_quant.offset;
				403	clamp_min = std::max(clamp_min, bottom_rail);
				404	}
				405
				406	if (actfn == nck::ActivationFunction::ReLU6) {
				407	const int32_t top_rail = output_quant.quantize(6.0f);
				408	clamp_max = std::min(clamp_max, top_rail);
				409	}
				410
				411	// Call the tile execution method
				412	tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows,
				413	StrideCols>(n_channels, packed_params, get_input_ptr, get_output_ptr,
				414	clamp_max, clamp_min, input_quant.offset,
				415	weight_quant.offset, output_quant.offset,
				416	requant.multiplier, requant.shift);
				417	}
				418
				419	template <
				420	unsigned int OutputTileRows, unsigned int OutputTileCols,
				421	unsigned int KernelRows, unsigned int KernelCols,
				422	unsigned int StrideRows, unsigned int StrideCols
				423	>
				424	template <nck::ActivationFunction Activation>
				425	void QAsymm8DepthwiseConvolution<
				426	OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
				427	>::execute_tile(
				428	int n_channels,
				429	const void* packed_params,
				430	const uint8_t* inptr,
				431	unsigned int in_row_stride,
				432	unsigned int in_col_stride,
				433	uint8_t* outptr,
				434	unsigned int out_row_stride,
				435	unsigned int out_col_stride
				436	) {
				437	// Construct methods to get pointers
				438	const auto get_input_ptr = [inptr, in_row_stride, in_col_stride](
				439	const int i, const int j, const int channel) {
				440	return inptr + i * in_row_stride + j * in_col_stride + channel;
				441	};
				442
				443	const auto get_output_ptr = [outptr, out_row_stride, out_col_stride](
				444	const int i, const int j, const int channel) {
				445	return outptr + i * out_row_stride + j * out_col_stride + channel;
				446	};
				447
				448	execute_tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
				449	StrideRows, StrideCols>(
				450	n_channels, packed_params, Activation, get_input_ptr, get_output_ptr,
				451	_inputs_quant, _weights_quant, _output_quant, rescale_parameters);
				452	}
				453
				454	template <
				455	unsigned int OutputTileRows, unsigned int OutputTileCols,
				456	unsigned int KernelRows, unsigned int KernelCols,
				457	unsigned int StrideRows, unsigned int StrideCols
				458	>
				459	template <nck::ActivationFunction Activation>
				460	void QAsymm8DepthwiseConvolution<
				461	OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols
				462	>::execute_tile(
				463	int n_channels,
				464	const void* packed_params,
				465	const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
				466	uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
				467	) {
				468	// Construct methods to get pointers
				469	const auto get_input_ptr = [inptrs](const int i, const int j,
				470	const int channel) {
				471	return inptrs[i][j] + channel;
				472	};
				473
				474	const auto get_output_ptr = [outptrs](const int i, const int j,
				475	const int channel) {
				476	return outptrs[i][j] + channel;
				477	};
				478
				479	// Call the tile execution method
				480	execute_tilefn<OutputTileRows, OutputTileCols, KernelRows, KernelCols,
				481	StrideRows, StrideCols>(
				482	n_channels, packed_params, Activation, get_input_ptr, get_output_ptr,
				483	_inputs_quant, _weights_quant, _output_quant, rescale_parameters);
				484	}
				485
Georgios Pinitas	47d39dc	2019-03-11 14:03:23 +0000	[diff] [blame]	486	} // namespace depthwise