Blame - src/core/NEON/kernels/NEAccumulateKernel.cpp - ml/ComputeLibrary

blob: 46179cadcb5d07af991a3f752e14ac9acadea4ee [file] [log] [blame]

Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1	/*
Sang-Hoon Park	68dd25f	2020-10-19 16:00:11 +0100	[diff] [blame]	2	* Copyright (c) 2016-2020 Arm Limited.
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
Michalis Spyrou	ebcebf1	2020-10-21 00:04:14 +0100	[diff] [blame^]	24	#include "src/core/NEON/kernels/NEAccumulateKernel.h"
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	25
				26	#include "arm_compute/core/Error.h"
				27	#include "arm_compute/core/Helpers.h"
				28	#include "arm_compute/core/IAccessWindow.h"
				29	#include "arm_compute/core/Types.h"
				30	#include "arm_compute/core/Validate.h"
Sang-Hoon Park	68dd25f	2020-10-19 16:00:11 +0100	[diff] [blame]	31	#include "src/core/helpers/AutoConfiguration.h"
				32	#include "src/core/helpers/WindowHelpers.h"
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	33
				34	#include <arm_neon.h>
				35
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	36	namespace arm_compute
				37	{
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	38	/* Max S16 value used for saturation purposes. */
				39	const static uint16x8_t max_int_u16 = vdupq_n_u16(static_cast<uint16_t>(INT16_MAX));
				40
Ioan-Cristian Szabo	5edbd1c	2017-11-13 13:34:08 +0000	[diff] [blame]	41	#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	42	namespace fp16
				43	{
				44	inline float16x8x2_t convert_u8x16_to_f16x8x2(uint8x16_t input)
				45	{
				46	const float16x8x2_t out =
				47	{
				48	{
				49	vcvtq_f16_u16(vmovl_u8(vget_low_u8(input))),
				50	vcvtq_f16_u16(vmovl_u8(vget_high_u8(input)))
				51	}
				52	};
				53
				54	return out;
				55	}
				56
				57	inline uint8x16_t convert_f16x8x2_to_u8x16(const float16x8x2_t &input)
				58	{
				59	return vcombine_u8(vmovn_u16(vcvtq_u16_f16(input.val[0])),
				60	vmovn_u16(vcvtq_u16_f16(input.val[1])));
				61	}
				62
				63	inline float16x8x2_t vector_accumulate_weighted(const float16x8x2_t &vec0, const float16x8x2_t &vec1, float16x8_t scale_val, float16x8_t scale_val2)
				64	{
				65	const float16x8x2_t res =
				66	{
				67	{
				68	vfmaq_f16(vmulq_f16(vec1.val[0], scale_val), vec0.val[0], scale_val2),
				69	vfmaq_f16(vmulq_f16(vec1.val[1], scale_val), vec0.val[1], scale_val2)
				70	}
				71	};
				72
				73	return res;
				74	}
				75
				76	void acc_we_v16_u8(const void __restrict input, void __restrict accum, float16x8_t scale_val, float16x8_t scale_val2)
				77	{
				78	ARM_COMPUTE_ERROR_ON(nullptr == input);
				79	ARM_COMPUTE_ERROR_ON(nullptr == accum);
				80
				81	const auto input_ptr = static_cast<const uint8_t *__restrict>(input);
				82	const auto accum_ptr = static_cast<uint8_t *__restrict>(accum);
				83
				84	const uint8x16x4_t input_buffer = vld4q_u8(input_ptr);
				85	uint8x16x4_t accum_buffer = vld4q_u8(accum_ptr);
				86
				87	const float16x8x2_t f16_input_0 = convert_u8x16_to_f16x8x2(input_buffer.val[0]);
				88	const float16x8x2_t f16_input_1 = convert_u8x16_to_f16x8x2(input_buffer.val[1]);
				89	const float16x8x2_t f16_input_2 = convert_u8x16_to_f16x8x2(input_buffer.val[2]);
				90	const float16x8x2_t f16_input_3 = convert_u8x16_to_f16x8x2(input_buffer.val[3]);
				91
				92	float16x8x2_t f16_accum_0 = convert_u8x16_to_f16x8x2(accum_buffer.val[0]);
				93	float16x8x2_t f16_accum_1 = convert_u8x16_to_f16x8x2(accum_buffer.val[1]);
				94	float16x8x2_t f16_accum_2 = convert_u8x16_to_f16x8x2(accum_buffer.val[2]);
				95	float16x8x2_t f16_accum_3 = convert_u8x16_to_f16x8x2(accum_buffer.val[3]);
				96
				97	f16_accum_0 = vector_accumulate_weighted(f16_input_0, f16_accum_0, scale_val, scale_val2);
				98	f16_accum_1 = vector_accumulate_weighted(f16_input_1, f16_accum_1, scale_val, scale_val2);
				99	f16_accum_2 = vector_accumulate_weighted(f16_input_2, f16_accum_2, scale_val, scale_val2);
				100	f16_accum_3 = vector_accumulate_weighted(f16_input_3, f16_accum_3, scale_val, scale_val2);
				101
				102	accum_buffer = { {
				103	convert_f16x8x2_to_u8x16(f16_accum_0),
				104	convert_f16x8x2_to_u8x16(f16_accum_1),
				105	convert_f16x8x2_to_u8x16(f16_accum_2),
				106	convert_f16x8x2_to_u8x16(f16_accum_3)
				107	}
				108	};
				109
				110	vst4q_u8(accum_ptr, accum_buffer);
				111	}
				112	} // namespace fp16
				113
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	114	void NEAccumulateWeightedFP16Kernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	115	{
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	116	ARM_COMPUTE_UNUSED(info);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	117	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				118	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
				119
				120	Iterator input(_input, window);
				121	Iterator accum(_output, window);
				122
				123	const float16x8_t scale_val = vdupq_n_f16(1.f - _alpha);
				124	const float16x8_t scale_val2 = vdupq_n_f16(_alpha);
				125
Michalis Spyrou	a4f378d	2019-04-26 14:54:54 +0100	[diff] [blame]	126	execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	127	{
				128	fp16::acc_we_v16_u8(input.ptr(), accum.ptr(), scale_val, scale_val2);
				129	},
				130	input, accum);
				131	}
Ioan-Cristian Szabo	5edbd1c	2017-11-13 13:34:08 +0000	[diff] [blame]	132	#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	133
				134	namespace
				135	{
				136	inline void acc_v16_u8(const void __restrict input, void __restrict accum)
				137	{
				138	ARM_COMPUTE_ERROR_ON(nullptr == input);
				139	ARM_COMPUTE_ERROR_ON(nullptr == accum);
				140
				141	const auto in = static_cast<const uint8_t *__restrict>(input);
				142	const auto out = static_cast<int16_t *__restrict>(accum);
				143
				144	uint8x16_t ta1 = vld1q_u8(in);
				145	int16x8_t ta2 = vld1q_s16(out);
				146	int16x8_t ta3 = vld1q_s16(out + 8);
				147
				148	ta2 = vqaddq_s16(ta2, vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(ta1))));
				149	ta3 = vqaddq_s16(ta3, vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(ta1))));
				150
				151	vst1q_s16(out, ta2);
				152	vst1q_s16(out + 8, ta3);
				153	}
				154
				155	inline float32x4x4_t convert_u8x16_to_f32x4x4(uint8x16_t input)
				156	{
				157	const uint16x8_t u16_output_low = vmovl_u8(vget_low_u8(input));
				158	const uint16x8_t u16_output_hi = vmovl_u8(vget_high_u8(input));
				159
				160	const float32x4x4_t res =
				161	{
				162	{
				163	vcvtq_f32_u32(vmovl_u16(vget_low_u16(u16_output_low))),
				164	vcvtq_f32_u32(vmovl_u16(vget_high_u16(u16_output_low))),
				165	vcvtq_f32_u32(vmovl_u16(vget_low_u16(u16_output_hi))),
				166	vcvtq_f32_u32(vmovl_u16(vget_high_u16(u16_output_hi)))
				167	}
				168	};
				169
				170	return res;
				171	}
				172
				173	inline uint8x16_t convert_f32x4x4_to_u8x16(const float32x4x4_t &input)
				174	{
				175	return vcombine_u8(vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(input.val[0])),
				176	vmovn_u32(vcvtq_u32_f32(input.val[1])))),
				177	vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(input.val[2])),
				178	vmovn_u32(vcvtq_u32_f32(input.val[3])))));
				179	}
				180
				181	inline float32x4x4_t vector_accumulate_weighted(const float32x4x4_t &vector_input, float32x4x4_t vector_output, float32x4_t scale_val, float32x4_t scale_val2)
				182	{
				183	vector_output.val[0] = vmulq_f32(vector_output.val[0], scale_val);
				184	vector_output.val[1] = vmulq_f32(vector_output.val[1], scale_val);
				185	vector_output.val[2] = vmulq_f32(vector_output.val[2], scale_val);
				186	vector_output.val[3] = vmulq_f32(vector_output.val[3], scale_val);
				187
				188	vector_output.val[0] = vmlaq_f32(vector_output.val[0], vector_input.val[0], scale_val2);
				189	vector_output.val[1] = vmlaq_f32(vector_output.val[1], vector_input.val[1], scale_val2);
				190	vector_output.val[2] = vmlaq_f32(vector_output.val[2], vector_input.val[2], scale_val2);
				191	vector_output.val[3] = vmlaq_f32(vector_output.val[3], vector_input.val[3], scale_val2);
				192
				193	return vector_output;
				194	}
				195
				196	inline void acc_we_v16_u8(const void __restrict input, void __restrict accum, const float32x4_t scale_val, const float32x4_t scale_val2)
				197	{
				198	ARM_COMPUTE_ERROR_ON(nullptr == input);
				199	ARM_COMPUTE_ERROR_ON(nullptr == accum);
				200
				201	const auto input_ptr = static_cast<const uint8_t *__restrict>(input);
				202	const auto accum_ptr = static_cast<uint8_t *__restrict>(accum);
				203
				204	const uint8x16_t input_buffer = vld1q_u8(input_ptr);
				205	const uint8x16_t accum_buffer = vld1q_u8(accum_ptr);
				206
				207	const float32x4x4_t f32_input_0 = convert_u8x16_to_f32x4x4(input_buffer);
				208	const float32x4x4_t f32_output_0 = convert_u8x16_to_f32x4x4(accum_buffer);
				209
				210	const float32x4x4_t f32_res_0 = vector_accumulate_weighted(f32_input_0, f32_output_0, scale_val, scale_val2);
				211
				212	vst1q_u8(accum_ptr, convert_f32x4x4_to_u8x16(f32_res_0));
				213	}
				214
				215	void acc_sq_v16_u8(const void __restrict input, uint32_t shift, void __restrict accum)
				216	{
				217	ARM_COMPUTE_ERROR_ON(nullptr == input);
				218	ARM_COMPUTE_ERROR_ON(nullptr == accum);
				219	ARM_COMPUTE_ERROR_ON(shift > 15);
				220
				221	const auto input_buffer = static_cast<const uint8_t *__restrict>(input);
				222	const auto accum_buffer = static_cast<int16_t *__restrict>(accum);
				223
				224	const uint8x16_t ta1 = vld1q_u8(input_buffer);
				225	uint16x8_t ta2 = vreinterpretq_u16_s16(vld1q_s16(accum_buffer));
				226	uint16x8_t ta3 = vreinterpretq_u16_s16(vld1q_s16(accum_buffer + 8));
				227
				228	const int16x8_t vector_shift = vdupq_n_s16(-static_cast<int16_t>(shift));
				229
				230	uint16x8_t linput = vmovl_u8(vget_low_u8(ta1));
				231	uint16x8_t hinput = vmovl_u8(vget_high_u8(ta1));
				232
				233	linput = vmulq_u16(linput, linput);
				234	hinput = vmulq_u16(hinput, hinput);
				235
				236	linput = vqshlq_u16(linput, vector_shift);
				237	hinput = vqshlq_u16(hinput, vector_shift);
				238
				239	ta2 = vqaddq_u16(ta2, linput);
				240	ta3 = vqaddq_u16(ta3, hinput);
				241
				242	vst1q_s16(accum_buffer, vreinterpretq_s16_u16(vminq_u16(max_int_u16, ta2)));
				243	vst1q_s16(accum_buffer + 8, vreinterpretq_s16_u16(vminq_u16(max_int_u16, ta3)));
				244	}
				245	} // namespace
				246
				247	void NEAccumulateKernel::configure(const ITensor input, ITensor accum)
				248	{
				249	ARM_COMPUTE_ERROR_ON_NULLPTR(input, accum);
				250
				251	set_shape_if_empty(*accum->info(), input->info()->tensor_shape());
				252
				253	set_format_if_unknown(*accum->info(), Format::S16);
				254
				255	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
				256	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16);
				257	ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, accum);
				258
				259	constexpr unsigned int num_elems_processed_per_iteration = 16;
				260	INESimpleKernel::configure(input, accum, num_elems_processed_per_iteration);
				261	}
				262
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	263	void NEAccumulateKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	264	{
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	265	ARM_COMPUTE_UNUSED(info);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	266	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				267	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
				268	Iterator input(_input, window);
				269	Iterator accum(_output, window);
				270
Michalis Spyrou	a4f378d	2019-04-26 14:54:54 +0100	[diff] [blame]	271	execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	272	{
				273	acc_v16_u8(input.ptr(), accum.ptr());
				274	},
				275	input, accum);
				276	}
				277
				278	NEAccumulateWeightedKernel::NEAccumulateWeightedKernel()
				279	: _alpha(0.0f)
				280	{
				281	}
				282
				283	void NEAccumulateWeightedKernel::configure(const ITensor input, float alpha, ITensor accum)
				284	{
				285	ARM_COMPUTE_ERROR_ON_NULLPTR(input, accum);
				286
				287	set_shape_if_empty(*accum->info(), input->info()->tensor_shape());
				288
				289	set_format_if_unknown(*accum->info(), Format::U8);
				290
				291	ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, accum);
				292	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
				293	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::U8);
				294	ARM_COMPUTE_ERROR_ON(alpha < 0.0 \|\| alpha > 1.0);
				295
				296	_alpha = alpha;
				297
				298	constexpr unsigned int num_elems_processed_per_iteration = 16;
				299	INESimpleKernel::configure(input, accum, num_elems_processed_per_iteration);
				300	}
				301
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	302	void NEAccumulateWeightedKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	303	{
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	304	ARM_COMPUTE_UNUSED(info);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	305	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				306	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
				307
				308	Iterator input(_input, window);
				309	Iterator accum(_output, window);
				310
				311	const float32x4_t scale_val = vdupq_n_f32(1.f - _alpha);
				312	const float32x4_t scale_val2 = vdupq_n_f32(_alpha);
				313
Michalis Spyrou	a4f378d	2019-04-26 14:54:54 +0100	[diff] [blame]	314	execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	315	{
				316	acc_we_v16_u8(input.ptr(), accum.ptr(), scale_val, scale_val2);
				317	},
				318	input, accum);
				319	}
				320
				321	NEAccumulateSquaredKernel::NEAccumulateSquaredKernel()
				322	: _shift(0)
				323	{
				324	}
				325
				326	void NEAccumulateSquaredKernel::configure(const ITensor input, uint32_t shift, ITensor accum)
				327	{
				328	ARM_COMPUTE_ERROR_ON_NULLPTR(input, accum);
				329
				330	set_shape_if_empty(*accum->info(), input->info()->tensor_shape());
				331
				332	set_format_if_unknown(*accum->info(), Format::S16);
				333
				334	ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, accum);
				335	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
				336	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16);
				337	ARM_COMPUTE_ERROR_ON(shift > 15);
				338
				339	_shift = shift;
				340
				341	constexpr unsigned int num_elems_processed_per_iteration = 16;
				342	INESimpleKernel::configure(input, accum, num_elems_processed_per_iteration);
				343	}
				344
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	345	void NEAccumulateSquaredKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	346	{
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	347	ARM_COMPUTE_UNUSED(info);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	348	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				349	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
				350	Iterator input(_input, window);
				351	Iterator accum(_output, window);
				352
Michalis Spyrou	a4f378d	2019-04-26 14:54:54 +0100	[diff] [blame]	353	execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	354	{
				355	acc_sq_v16_u8(input.ptr(), _shift, accum.ptr());
				356	},
				357	input, accum);
				358	}
Michalis Spyrou	ebcebf1	2020-10-21 00:04:14 +0100	[diff] [blame^]	359	} // namespace arm_compute