/*
 * Copyright (c) 2016, 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/NEON/kernels/NEAccumulateKernel.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"

#include <arm_neon.h>

using namespace arm_compute;

namespace arm_compute
{
class Coordinates;
} // namespace arm_compute

/* Max S16 value used for saturation purposes. */
const static uint16x8_t max_int_u16 = vdupq_n_u16(static_cast<uint16_t>(INT16_MAX));

#ifdef ARM_COMPUTE_ENABLE_FP16
namespace fp16
{
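/** Widen 16 U8 elements into two F16 vectors of 8 elements (low and high halves). */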
inline float16x8x2_t convert_u8x16_to_f16x8x2(uint8x16_t input)
{
    const float16x8x2_t out =
    {
        {
            vcvtq_f16_u16(vmovl_u8(vget_low_u8(input))),
            vcvtq_f16_u16(vmovl_u8(vget_high_u8(input)))
        }
    };

    return out;
}

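/** Narrow two F16 vectors of 8 elements back into a single vector of 16 U8 elements. */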
inline uint8x16_t convert_f16x8x2_to_u8x16(const float16x8x2_t &input)
{
    return vcombine_u8(vmovn_u16(vcvtq_u16_f16(input.val[0])),
                       vmovn_u16(vcvtq_u16_f16(input.val[1])));
}

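/** Per-lane weighted sum: vec1 * scale_val + vec0 * scale_val2, i.e. accum * (1 - alpha) + input * alpha in F16. */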
inline float16x8x2_t vector_accumulate_weighted(const float16x8x2_t &vec0, const float16x8x2_t &vec1, float16x8_t scale_val, float16x8_t scale_val2)
{
    const float16x8x2_t res =
    {
        {
            vfmaq_f16(vmulq_f16(vec1.val[0], scale_val), vec0.val[0], scale_val2),
            vfmaq_f16(vmulq_f16(vec1.val[1], scale_val), vec0.val[1], scale_val2)
        }
    };

    return res;
}

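/** Weighted accumulation of a 64-byte block loaded with vld4q_u8, computed in F16: accum = accum * (1 - alpha) + input * alpha. */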
void acc_we_v16_u8(const void *__restrict input, void *__restrict accum, float16x8_t scale_val, float16x8_t scale_val2)
{
    ARM_COMPUTE_ERROR_ON(nullptr == input);
    ARM_COMPUTE_ERROR_ON(nullptr == accum);

    const auto input_ptr = static_cast<const uint8_t *__restrict>(input);
    const auto accum_ptr = static_cast<uint8_t *__restrict>(accum);

    const uint8x16x4_t input_buffer = vld4q_u8(input_ptr);
    uint8x16x4_t       accum_buffer = vld4q_u8(accum_ptr);

    const float16x8x2_t f16_input_0 = convert_u8x16_to_f16x8x2(input_buffer.val[0]);
    const float16x8x2_t f16_input_1 = convert_u8x16_to_f16x8x2(input_buffer.val[1]);
    const float16x8x2_t f16_input_2 = convert_u8x16_to_f16x8x2(input_buffer.val[2]);
    const float16x8x2_t f16_input_3 = convert_u8x16_to_f16x8x2(input_buffer.val[3]);

    float16x8x2_t f16_accum_0 = convert_u8x16_to_f16x8x2(accum_buffer.val[0]);
    float16x8x2_t f16_accum_1 = convert_u8x16_to_f16x8x2(accum_buffer.val[1]);
    float16x8x2_t f16_accum_2 = convert_u8x16_to_f16x8x2(accum_buffer.val[2]);
    float16x8x2_t f16_accum_3 = convert_u8x16_to_f16x8x2(accum_buffer.val[3]);

    f16_accum_0 = vector_accumulate_weighted(f16_input_0, f16_accum_0, scale_val, scale_val2);
    f16_accum_1 = vector_accumulate_weighted(f16_input_1, f16_accum_1, scale_val, scale_val2);
    f16_accum_2 = vector_accumulate_weighted(f16_input_2, f16_accum_2, scale_val, scale_val2);
    f16_accum_3 = vector_accumulate_weighted(f16_input_3, f16_accum_3, scale_val, scale_val2);

    accum_buffer = { {
            convert_f16x8x2_to_u8x16(f16_accum_0),
            convert_f16x8x2_to_u8x16(f16_accum_1),
            convert_f16x8x2_to_u8x16(f16_accum_2),
            convert_f16x8x2_to_u8x16(f16_accum_3)
        }
    };

    vst4q_u8(accum_ptr, accum_buffer);
}
} // namespace fp16

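/* FP16 variant: precompute the (1 - alpha) and alpha scale vectors once, then apply the weighted accumulation across the window. */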
void NEAccumulateWeightedFP16Kernel::run(const Window &window)
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);

    Iterator input(_input, window);
    Iterator accum(_output, window);

    const float16x8_t scale_val  = vdupq_n_f16(1.f - _alpha);
    const float16x8_t scale_val2 = vdupq_n_f16(_alpha);

    execute_window_loop(window, [&](const Coordinates & id)
    {
        fp16::acc_we_v16_u8(input.ptr(), accum.ptr(), scale_val, scale_val2);
    },
    input, accum);
}
#endif /* ARM_COMPUTE_ENABLE_FP16 */

namespace
{
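/** Saturating accumulation of 16 U8 input elements into a S16 accumulator: accum = sat(accum + input). */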
inline void acc_v16_u8(const void *__restrict input, void *__restrict accum)
{
    ARM_COMPUTE_ERROR_ON(nullptr == input);
    ARM_COMPUTE_ERROR_ON(nullptr == accum);

    const auto in  = static_cast<const uint8_t *__restrict>(input);
    const auto out = static_cast<int16_t *__restrict>(accum);

    uint8x16_t ta1 = vld1q_u8(in);
    int16x8_t  ta2 = vld1q_s16(out);
    int16x8_t  ta3 = vld1q_s16(out + 8);

    ta2 = vqaddq_s16(ta2, vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(ta1))));
    ta3 = vqaddq_s16(ta3, vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(ta1))));

    vst1q_s16(out, ta2);
    vst1q_s16(out + 8, ta3);
}

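/** Widen 16 U8 elements into four F32 vectors of 4 elements. */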
inline float32x4x4_t convert_u8x16_to_f32x4x4(uint8x16_t input)
{
    const uint16x8_t u16_output_low = vmovl_u8(vget_low_u8(input));
    const uint16x8_t u16_output_hi  = vmovl_u8(vget_high_u8(input));

    const float32x4x4_t res =
    {
        {
            vcvtq_f32_u32(vmovl_u16(vget_low_u16(u16_output_low))),
            vcvtq_f32_u32(vmovl_u16(vget_high_u16(u16_output_low))),
            vcvtq_f32_u32(vmovl_u16(vget_low_u16(u16_output_hi))),
            vcvtq_f32_u32(vmovl_u16(vget_high_u16(u16_output_hi)))
        }
    };

    return res;
}

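/** Narrow four F32 vectors of 4 elements back into a single vector of 16 U8 elements. */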
inline uint8x16_t convert_f32x4x4_to_u8x16(const float32x4x4_t &input)
{
    return vcombine_u8(vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(input.val[0])),
                                              vmovn_u32(vcvtq_u32_f32(input.val[1])))),
                       vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(input.val[2])),
                                              vmovn_u32(vcvtq_u32_f32(input.val[3])))));
}

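/** Per-lane weighted sum: vector_output * scale_val + vector_input * scale_val2, i.e. accum * (1 - alpha) + input * alpha in F32. */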
inline float32x4x4_t vector_accumulate_weighted(const float32x4x4_t &vector_input, float32x4x4_t vector_output, float32x4_t scale_val, float32x4_t scale_val2)
{
    vector_output.val[0] = vmulq_f32(vector_output.val[0], scale_val);
    vector_output.val[1] = vmulq_f32(vector_output.val[1], scale_val);
    vector_output.val[2] = vmulq_f32(vector_output.val[2], scale_val);
    vector_output.val[3] = vmulq_f32(vector_output.val[3], scale_val);

    vector_output.val[0] = vmlaq_f32(vector_output.val[0], vector_input.val[0], scale_val2);
    vector_output.val[1] = vmlaq_f32(vector_output.val[1], vector_input.val[1], scale_val2);
    vector_output.val[2] = vmlaq_f32(vector_output.val[2], vector_input.val[2], scale_val2);
    vector_output.val[3] = vmlaq_f32(vector_output.val[3], vector_input.val[3], scale_val2);

    return vector_output;
}

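/** Weighted accumulation of 16 U8 elements computed in F32: accum = accum * (1 - alpha) + input * alpha. */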
inline void acc_we_v16_u8(const void *__restrict input, void *__restrict accum, const float32x4_t scale_val, const float32x4_t scale_val2)
{
    ARM_COMPUTE_ERROR_ON(nullptr == input);
    ARM_COMPUTE_ERROR_ON(nullptr == accum);

    const auto input_ptr = static_cast<const uint8_t *__restrict>(input);
    const auto accum_ptr = static_cast<uint8_t *__restrict>(accum);

    const uint8x16_t input_buffer = vld1q_u8(input_ptr);
    const uint8x16_t accum_buffer = vld1q_u8(accum_ptr);

    const float32x4x4_t f32_input_0  = convert_u8x16_to_f32x4x4(input_buffer);
    const float32x4x4_t f32_output_0 = convert_u8x16_to_f32x4x4(accum_buffer);

    const float32x4x4_t f32_res_0 = vector_accumulate_weighted(f32_input_0, f32_output_0, scale_val, scale_val2);

    vst1q_u8(accum_ptr, convert_f32x4x4_to_u8x16(f32_res_0));
}

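/** Squared accumulation of 16 U8 elements into a S16 accumulator: accum = min(INT16_MAX, accum + ((input * input) >> shift)), using saturating adds. */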
void acc_sq_v16_u8(const void *__restrict input, uint32_t shift, void *__restrict accum)
{
    ARM_COMPUTE_ERROR_ON(nullptr == input);
    ARM_COMPUTE_ERROR_ON(nullptr == accum);
    ARM_COMPUTE_ERROR_ON(shift > 15);

    const auto input_buffer = static_cast<const uint8_t *__restrict>(input);
    const auto accum_buffer = static_cast<int16_t *__restrict>(accum);

    const uint8x16_t ta1 = vld1q_u8(input_buffer);
    uint16x8_t       ta2 = vreinterpretq_u16_s16(vld1q_s16(accum_buffer));
    uint16x8_t       ta3 = vreinterpretq_u16_s16(vld1q_s16(accum_buffer + 8));

    const int16x8_t vector_shift = vdupq_n_s16(-static_cast<int16_t>(shift));

    uint16x8_t linput = vmovl_u8(vget_low_u8(ta1));
    uint16x8_t hinput = vmovl_u8(vget_high_u8(ta1));

    linput = vmulq_u16(linput, linput);
    hinput = vmulq_u16(hinput, hinput);

    linput = vqshlq_u16(linput, vector_shift);
    hinput = vqshlq_u16(hinput, vector_shift);

    ta2 = vqaddq_u16(ta2, linput);
    ta3 = vqaddq_u16(ta3, hinput);

    vst1q_s16(accum_buffer, vreinterpretq_s16_u16(vminq_u16(max_int_u16, ta2)));
    vst1q_s16(accum_buffer + 8, vreinterpretq_s16_u16(vminq_u16(max_int_u16, ta3)));
}
} // namespace

void NEAccumulateKernel::configure(const ITensor *input, ITensor *accum)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, accum);

    set_shape_if_empty(*accum->info(), input->info()->tensor_shape());

    set_format_if_unknown(*accum->info(), Format::S16);

    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16);
    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, accum);

    constexpr unsigned int num_elems_processed_per_iteration = 16;
    INESimpleKernel::configure(input, accum, num_elems_processed_per_iteration);
}

void NEAccumulateKernel::run(const Window &window)
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
    Iterator input(_input, window);
    Iterator accum(_output, window);

    execute_window_loop(window, [&](const Coordinates & id)
    {
        acc_v16_u8(input.ptr(), accum.ptr());
    },
    input, accum);
}

NEAccumulateWeightedKernel::NEAccumulateWeightedKernel()
    : _alpha(0.0f)
{
}

void NEAccumulateWeightedKernel::configure(const ITensor *input, float alpha, ITensor *accum)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, accum);

    set_shape_if_empty(*accum->info(), input->info()->tensor_shape());

    set_format_if_unknown(*accum->info(), Format::U8);

    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, accum);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::U8);
    ARM_COMPUTE_ERROR_ON(alpha < 0.0 || alpha > 1.0);

    _alpha = alpha;

    constexpr unsigned int num_elems_processed_per_iteration = 16;
    INESimpleKernel::configure(input, accum, num_elems_processed_per_iteration);
}

void NEAccumulateWeightedKernel::run(const Window &window)
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);

    Iterator input(_input, window);
    Iterator accum(_output, window);

    const float32x4_t scale_val  = vdupq_n_f32(1.f - _alpha);
    const float32x4_t scale_val2 = vdupq_n_f32(_alpha);

    execute_window_loop(window, [&](const Coordinates & id)
    {
        acc_we_v16_u8(input.ptr(), accum.ptr(), scale_val, scale_val2);
    },
    input, accum);
}

NEAccumulateSquaredKernel::NEAccumulateSquaredKernel()
    : _shift(0)
{
}

void NEAccumulateSquaredKernel::configure(const ITensor *input, uint32_t shift, ITensor *accum)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, accum);

    set_shape_if_empty(*accum->info(), input->info()->tensor_shape());

    set_format_if_unknown(*accum->info(), Format::S16);

    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, accum);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16);
    ARM_COMPUTE_ERROR_ON(shift > 15);

    _shift = shift;

    constexpr unsigned int num_elems_processed_per_iteration = 16;
    INESimpleKernel::configure(input, accum, num_elems_processed_per_iteration);
}

void NEAccumulateSquaredKernel::run(const Window &window)
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
    Iterator input(_input, window);
    Iterator accum(_output, window);

    execute_window_loop(window, [&](const Coordinates & id)
    {
        acc_sq_v16_u8(input.ptr(), _shift, accum.ptr());
    },
    input, accum);
}