/*
 * Copyright (c) 2016, 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"

#include <arm_neon.h>
#include <cstddef>

using namespace arm_compute;

namespace arm_compute
{
class Coordinates;
} // namespace arm_compute

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
namespace fp16
{
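// The three mask helpers below implement the raster-order tie-break of the
// suppression: the centre score only needs to be >= its top-row and left
// neighbours, but must be strictly > its right and bottom-row neighbours.
// This keeps exactly one pixel from a plateau of equal scores.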
inline void mask_top(const float16x8_t &vc, const float16x8_t &in0, const float16x8_t &in1, uint16x8_t &mask)
{
    // vc >= nc.val[0], vc >= nc.val[1], vc >= nc.val[2]
    mask = vandq_u16(mask, vcgeq_f16(vc, in0));
    mask = vandq_u16(mask, vcgeq_f16(vc, vextq_f16(in0, in1, 1)));
    mask = vandq_u16(mask, vcgeq_f16(vc, vextq_f16(in0, in1, 2)));
}

inline void mask_middle(const float16x8_t &vc, const float16x8_t &in0, const float16x8_t &in1, uint16x8_t &mask)
{
    // vc >= nc.val[0], vc > nc.val[2]
    mask = vandq_u16(mask, vcgeq_f16(vc, in0));
    mask = vandq_u16(mask, vcgtq_f16(vc, vextq_f16(in0, in1, 2)));
}

inline void mask_bottom(const float16x8_t &vc, const float16x8_t &in0, const float16x8_t &in1, uint16x8_t &mask)
{
    // vc > nc.val[0], vc > nc.val[1], vc > nc.val[2]
    mask = vandq_u16(mask, vcgtq_f16(vc, in0));
    mask = vandq_u16(mask, vcgtq_f16(vc, vextq_f16(in0, in1, 1)));
    mask = vandq_u16(mask, vcgtq_f16(vc, vextq_f16(in0, in1, 2)));
}

inline void non_maxima_suppression3x3_F32_F32(const void *__restrict in_ptr, void *__restrict out_ptr, const uint32_t in_stride)
{
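    // 16 pixels are processed per call. F32 scores are narrowed to F16 for the
    // comparisons and widened back to F32 on store; F32 values that differ only
    // below F16 precision therefore compare as ties here.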
    auto       in  = static_cast<const float *__restrict>(in_ptr) - 1;
    const auto out = static_cast<float *__restrict>(out_ptr);

    // Get centre scores
    const float16x8x2_t vc =
    {
        vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 1)), vcvt_f16_f32(vld1q_f32(in + 5))),
        vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 9)), vcvt_f16_f32(vld1q_f32(in + 13)))
    };

    // Neighboring pixels
    in -= in_stride;

    static const float16x4_t zero_f16x4 = vdup_n_f16(0);
    static const uint16x8_t  zero_u16   = vdupq_n_u16(0);
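    // Lane-wise zero == zero yields an all-ones vector: start from an
    // "every lane passes" mask and AND each comparison into it.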
    static const uint16x8_t  true_mask  = vceqq_u16(zero_u16, zero_u16);
    static const uint16x8x2_t true_mask_x2 =
    {
        true_mask,
        true_mask
    };

    uint16x8x2_t mask = true_mask_x2;

    // Top row
    const float16x8_t tmp_top0 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in)), vcvt_f16_f32(vld1q_f32(in + 4)));
    const float16x8_t tmp_top1 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 8)), vcvt_f16_f32(vld1q_f32(in + 12)));
    const float16x8_t tmp_top2 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 16)), zero_f16x4);

    // vc >= nc.val[0], vc >= nc.val[1], vc >= nc.val[2]
    mask_top(vc.val[0], tmp_top0, tmp_top1, mask.val[0]);
    mask_top(vc.val[1], tmp_top1, tmp_top2, mask.val[1]);

    in += in_stride;

    // Middle row
    const float16x8_t tmp_mid0 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in)), vcvt_f16_f32(vld1q_f32(in + 4)));
    const float16x8_t tmp_mid1 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 8)), vcvt_f16_f32(vld1q_f32(in + 12)));
    const float16x8_t tmp_mid2 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 16)), zero_f16x4);

    // vc >= nc.val[0], vc > nc.val[2]
    mask_middle(vc.val[0], tmp_mid0, tmp_mid1, mask.val[0]);
    mask_middle(vc.val[1], tmp_mid1, tmp_mid2, mask.val[1]);

    in += in_stride;

    // Bottom row
    const float16x8_t tmp_bot0 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in)), vcvt_f16_f32(vld1q_f32(in + 4)));
    const float16x8_t tmp_bot1 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 8)), vcvt_f16_f32(vld1q_f32(in + 12)));
    const float16x8_t tmp_bot2 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 16)), zero_f16x4);

    // vc > nc.val[0], vc > nc.val[1], vc > nc.val[2]
    mask_bottom(vc.val[0], tmp_bot0, tmp_bot1, mask.val[0]);
    mask_bottom(vc.val[1], tmp_bot1, tmp_bot2, mask.val[1]);

    // Store
    static const float16x8_t zero_f16x8 = vdupq_n_f16(0);

    const float16x8_t suppressed0 = vbslq_f16(mask.val[0], vc.val[0], zero_f16x8);
    vst1q_f32(out + 0, vcvt_f32_f16(vget_low_f16(suppressed0)));
    vst1q_f32(out + 4, vcvt_f32_f16(vget_high_f16(suppressed0)));

    const float16x8_t suppressed1 = vbslq_f16(mask.val[1], vc.val[1], zero_f16x8);
    vst1q_f32(out + 8, vcvt_f32_f16(vget_low_f16(suppressed1)));
    vst1q_f32(out + 12, vcvt_f32_f16(vget_high_f16(suppressed1)));
}

inline void non_maxima_suppression3x3_U8_U8(const void *__restrict in_ptr, void *__restrict out_ptr, const uint32_t in_stride)
{
    auto       in  = static_cast<const uint8_t *__restrict>(in_ptr) - 1;
    const auto out = static_cast<uint8_t *__restrict>(out_ptr);

    // Get centre scores
    const uint8x16_t vc = vld1q_u8(in + 1);

    // Neighboring pixels
    in -= in_stride;

    // Top row
    const uint8x16_t l_nc_0 = vld1q_u8(in);
    const uint8x16_t m_nc_0 = vld1q_u8(in + 1);
    const uint8x16_t r_nc_0 = vld1q_u8(in + 2);

    // Keep center scores if ...
    // vc >= l_nc_0, vc >= m_nc_0, vc >= r_nc_0
    uint8x16_t mask = vcgeq_u8(vc, l_nc_0);
    mask            = vandq_u8(mask, vcgeq_u8(vc, m_nc_0));
    mask            = vandq_u8(mask, vcgeq_u8(vc, r_nc_0));

    in += in_stride;

    // Middle row
    const uint8x16_t l_nc_1 = vld1q_u8(in);
    const uint8x16_t r_nc_1 = vld1q_u8(in + 2);

    // ... and ...
    // vc >= l_nc_1, vc > r_nc_1
    mask = vandq_u8(mask, vcgeq_u8(vc, l_nc_1));
    mask = vandq_u8(mask, vcgtq_u8(vc, r_nc_1));

    in += in_stride;

    // Bottom row
    const uint8x16_t l_nc_2 = vld1q_u8(in);
    const uint8x16_t m_nc_2 = vld1q_u8(in + 1);
    const uint8x16_t r_nc_2 = vld1q_u8(in + 2);

    // ... and ...
    // vc > l_nc_2, vc > m_nc_2, vc > r_nc_2
    mask = vandq_u8(mask, vcgtq_u8(vc, l_nc_2));
    mask = vandq_u8(mask, vcgtq_u8(vc, m_nc_2));
    mask = vandq_u8(mask, vcgtq_u8(vc, r_nc_2));

    // Store
    static const uint8x16_t zero = vdupq_n_u8(0);
    vst1q_u8(out, vbslq_u8(mask, vc, zero));
}
} // namespace fp16

void NENonMaximaSuppression3x3FP16Kernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
{
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F32);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F32);
    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);

    _input  = input;
    _output = output;

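    // The data type is restricted to U8/F32 by the checks above, so the
    // "default" branch can only be reached for F32 input.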
    switch(input->info()->data_type())
    {
        case DataType::U8:
            _func = &fp16::non_maxima_suppression3x3_U8_U8;
            break;
        default:
            _func = &fp16::non_maxima_suppression3x3_F32_F32;
            break;
    }

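    // Each iteration produces 16 elements; the reads cover those 16 plus one
    // border pixel on each side, and the F32 path reads a few extra elements
    // to feed its trailing vector loads.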
    constexpr unsigned int num_elems_processed_per_iteration = 16;
    const unsigned int     num_elems_read_per_iteration      = 16 + 2 * border_size().left + (input->info()->data_type() == DataType::U8 ? 0 : 3);
    constexpr unsigned int num_elems_written_per_iteration   = 16;
    constexpr unsigned int num_rows_read_per_iteration       = 3;

    // Configure kernel window
    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);

    update_window_and_padding(win,
                              AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
                              output_access);

    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());

    INEKernel::configure(win);
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */

namespace
{
inline void non_maxima_suppression3x3_FLOAT_FLOAT(const void *__restrict input_ptr, void *__restrict output_ptr, const uint32_t input_stride)
{
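    // Pure-F32 path: 16 pixels per call, handled as four float32x4 groups.
    // Same raster-order tie-break as above: >= against the top row and the
    // left neighbour, strict > against the right neighbour and the bottom row.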
    auto       input  = static_cast<const float *__restrict>(input_ptr) - 1;
    const auto output = static_cast<float *__restrict>(output_ptr);

    // Get centre scores
    const float32x4x4_t vc =
    {
        {
            vld1q_f32(input + 1),
            vld1q_f32(input + 5),
            vld1q_f32(input + 9),
            vld1q_f32(input + 13)
        }
    };

    // Neighboring pixels
    float32x4x4_t l_nc{ {} };
    float32x4x4_t m_nc{ {} };
    float32x4x4_t r_nc{ {} };
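    // Left/middle/right neighbour vectors come from overlapping loads: each
    // group keeps the raw load as l_nc and derives m_nc/r_nc by shifting in
    // lanes from the following load with vextq_f32, instead of issuing two
    // extra unaligned loads per group.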

    input -= input_stride;

    // Row0 - Low part
    float32x4_t tmp_low   = vld1q_f32(input);
    float32x4_t tmp_high  = vld1q_f32(input + 4);
    float32x4_t tmp_high1 = vld1q_f32(input + 8);

    l_nc.val[0] = tmp_low;
    m_nc.val[0] = vextq_f32(tmp_low, tmp_high, 1);
    r_nc.val[0] = vextq_f32(tmp_low, tmp_high, 2);

    tmp_low  = tmp_high;
    tmp_high = tmp_high1;

    l_nc.val[1] = tmp_low;
    m_nc.val[1] = vextq_f32(tmp_low, tmp_high, 1);
    r_nc.val[1] = vextq_f32(tmp_low, tmp_high, 2);

    // Row0 - High part
    tmp_low   = tmp_high1;
    tmp_high  = vld1q_f32(input + 12);
    tmp_high1 = vld1q_f32(input + 16);

    l_nc.val[2] = tmp_low;
    m_nc.val[2] = vextq_f32(tmp_low, tmp_high, 1);
    r_nc.val[2] = vextq_f32(tmp_low, tmp_high, 2);

    tmp_low  = tmp_high;
    tmp_high = tmp_high1;

    l_nc.val[3] = tmp_low;
    m_nc.val[3] = vextq_f32(tmp_low, tmp_high, 1);
    r_nc.val[3] = vextq_f32(tmp_low, tmp_high, 2);

    // vc >= l_nc, vc >= m_nc, vc >= r_nc
    uint32x4x4_t mask{ {} };
    mask.val[0] = vcgeq_f32(vc.val[0], l_nc.val[0]);
    mask.val[0] = vandq_u32(mask.val[0], vcgeq_f32(vc.val[0], m_nc.val[0]));
    mask.val[0] = vandq_u32(mask.val[0], vcgeq_f32(vc.val[0], r_nc.val[0]));
    mask.val[1] = vcgeq_f32(vc.val[1], l_nc.val[1]);
    mask.val[1] = vandq_u32(mask.val[1], vcgeq_f32(vc.val[1], m_nc.val[1]));
    mask.val[1] = vandq_u32(mask.val[1], vcgeq_f32(vc.val[1], r_nc.val[1]));
    mask.val[2] = vcgeq_f32(vc.val[2], l_nc.val[2]);
    mask.val[2] = vandq_u32(mask.val[2], vcgeq_f32(vc.val[2], m_nc.val[2]));
    mask.val[2] = vandq_u32(mask.val[2], vcgeq_f32(vc.val[2], r_nc.val[2]));
    mask.val[3] = vcgeq_f32(vc.val[3], l_nc.val[3]);
    mask.val[3] = vandq_u32(mask.val[3], vcgeq_f32(vc.val[3], m_nc.val[3]));
    mask.val[3] = vandq_u32(mask.val[3], vcgeq_f32(vc.val[3], r_nc.val[3]));

    input += input_stride;

    // Row1 - Low part
    tmp_low   = vld1q_f32(input);
    tmp_high  = vld1q_f32(input + 4);
    tmp_high1 = vld1q_f32(input + 8);

    l_nc.val[0] = tmp_low;
    r_nc.val[0] = vextq_f32(tmp_low, tmp_high, 2);

    tmp_low  = tmp_high;
    tmp_high = tmp_high1;

    l_nc.val[1] = tmp_low;
    r_nc.val[1] = vextq_f32(tmp_low, tmp_high, 2);

    // Row1 - High part
    tmp_low   = tmp_high1;
    tmp_high  = vld1q_f32(input + 12);
    tmp_high1 = vld1q_f32(input + 16);

    l_nc.val[2] = tmp_low;
    r_nc.val[2] = vextq_f32(tmp_low, tmp_high, 2);

    tmp_low  = tmp_high;
    tmp_high = tmp_high1;

    l_nc.val[3] = tmp_low;
    r_nc.val[3] = vextq_f32(tmp_low, tmp_high, 2);

    // vc >= l_nc, vc > r_nc
    mask.val[0] = vandq_u32(mask.val[0], vcgeq_f32(vc.val[0], l_nc.val[0]));
    mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], r_nc.val[0]));
    mask.val[1] = vandq_u32(mask.val[1], vcgeq_f32(vc.val[1], l_nc.val[1]));
    mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], r_nc.val[1]));
    mask.val[2] = vandq_u32(mask.val[2], vcgeq_f32(vc.val[2], l_nc.val[2]));
    mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], r_nc.val[2]));
    mask.val[3] = vandq_u32(mask.val[3], vcgeq_f32(vc.val[3], l_nc.val[3]));
    mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], r_nc.val[3]));

    input += input_stride;

    // Row2 - Low part
    tmp_low   = vld1q_f32(input);
    tmp_high  = vld1q_f32(input + 4);
    tmp_high1 = vld1q_f32(input + 8);

    l_nc.val[0] = tmp_low;
    m_nc.val[0] = vextq_f32(tmp_low, tmp_high, 1);
    r_nc.val[0] = vextq_f32(tmp_low, tmp_high, 2);

    tmp_low  = tmp_high;
    tmp_high = tmp_high1;

    l_nc.val[1] = tmp_low;
    m_nc.val[1] = vextq_f32(tmp_low, tmp_high, 1);
    r_nc.val[1] = vextq_f32(tmp_low, tmp_high, 2);

    // Row2 - High part
    tmp_low   = tmp_high1;
    tmp_high  = vld1q_f32(input + 12);
    tmp_high1 = vld1q_f32(input + 16);

    l_nc.val[2] = tmp_low;
    m_nc.val[2] = vextq_f32(tmp_low, tmp_high, 1);
    r_nc.val[2] = vextq_f32(tmp_low, tmp_high, 2);

    tmp_low  = tmp_high;
    tmp_high = tmp_high1;

    l_nc.val[3] = tmp_low;
    m_nc.val[3] = vextq_f32(tmp_low, tmp_high, 1);
    r_nc.val[3] = vextq_f32(tmp_low, tmp_high, 2);

    // vc > l_nc, vc > m_nc, vc > r_nc
    mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], l_nc.val[0]));
    mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], m_nc.val[0]));
    mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], r_nc.val[0]));
    mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], l_nc.val[1]));
    mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], m_nc.val[1]));
    mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], r_nc.val[1]));
    mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], l_nc.val[2]));
    mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], m_nc.val[2]));
    mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], r_nc.val[2]));
    mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], l_nc.val[3]));
    mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], m_nc.val[3]));
    mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], r_nc.val[3]));

    static const float32x4_t zero = vdupq_n_f32(0.f);

    // Store: keep the centre score where every comparison passed, zero elsewhere
    vst1q_f32(output + 0, vbslq_f32(mask.val[0], vc.val[0], zero));
    vst1q_f32(output + 4, vbslq_f32(mask.val[1], vc.val[1], zero));
    vst1q_f32(output + 8, vbslq_f32(mask.val[2], vc.val[2], zero));
    vst1q_f32(output + 12, vbslq_f32(mask.val[3], vc.val[3], zero));
}

inline void non_maxima_suppression3x3_U8_U8(const void *__restrict input_ptr, void *__restrict output_ptr, const uint32_t input_stride)
{
    auto       input  = static_cast<const uint8_t *__restrict>(input_ptr) - 1;
    const auto output = static_cast<uint8_t *__restrict>(output_ptr);

    // Get centre scores
    const uint8x16_t vc = vld1q_u8(input + 1);

    // Neighboring pixels
    uint8x16_t l_nc{};
    uint8x16_t m_nc{};
    uint8x16_t r_nc{};

    input -= input_stride;

    // Row0
    l_nc = vld1q_u8(input);
    m_nc = vld1q_u8(input + 1);
    r_nc = vld1q_u8(input + 2);

    // vc >= l_nc, vc >= m_nc, vc >= r_nc
    uint8x16_t mask = vcgeq_u8(vc, l_nc);
    mask            = vandq_u8(mask, vcgeq_u8(vc, m_nc));
    mask            = vandq_u8(mask, vcgeq_u8(vc, r_nc));

    input += input_stride;

    // Row1
    l_nc = vld1q_u8(input);
    r_nc = vld1q_u8(input + 2);

    // vc >= l_nc, vc > r_nc
    mask = vandq_u8(mask, vcgeq_u8(vc, l_nc));
    mask = vandq_u8(mask, vcgtq_u8(vc, r_nc));

    input += input_stride;

    // Row2
    l_nc = vld1q_u8(input);
    m_nc = vld1q_u8(input + 1);
    r_nc = vld1q_u8(input + 2);

    // vc > l_nc, vc > m_nc, vc > r_nc
    mask = vandq_u8(mask, vcgtq_u8(vc, l_nc));
    mask = vandq_u8(mask, vcgtq_u8(vc, m_nc));
    mask = vandq_u8(mask, vcgtq_u8(vc, r_nc));

    static const uint8x16_t zero = vdupq_n_u8(0);

    // Store
    vst1q_u8(output, vbslq_u8(mask, vc, zero));
}
} // namespace

NENonMaximaSuppression3x3Kernel::NENonMaximaSuppression3x3Kernel()
    : _func(nullptr), _input(nullptr), _output(nullptr)
{
}

BorderSize NENonMaximaSuppression3x3Kernel::border_size() const
{
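    // A 3x3 neighbourhood needs one border pixel on each side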
    return BorderSize(1);
}

void NENonMaximaSuppression3x3Kernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
{
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F32);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F32);
    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);

    _input  = input;
    _output = output;

    if(input->info()->data_type() == DataType::U8)
    {
        _func = &non_maxima_suppression3x3_U8_U8;
    }
    else
    {
        _func = &non_maxima_suppression3x3_FLOAT_FLOAT;
    }

    constexpr unsigned int num_elems_processed_per_iteration = 16;
    const unsigned int     num_elems_read_per_iteration      = 16 + 2 * border_size().left + (input->info()->data_type() == DataType::U8 ? 0 : 3);
    constexpr unsigned int num_elems_written_per_iteration   = 16;
    constexpr unsigned int num_rows_read_per_iteration       = 3;

    // Configure kernel window
    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);

    update_window_and_padding(win,
                              AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
                              output_access);

    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());

    INEKernel::configure(win);
}

void NENonMaximaSuppression3x3Kernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
    ARM_COMPUTE_ERROR_ON(_func == nullptr);
    Iterator input(_input, window);
    Iterator output(_output, window);

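    // Row stride expressed in elements rather than bytes, since the kernels
    // index the rows through typed pointers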
    const size_t input_stride = _input->info()->strides_in_bytes()[1] / element_size_from_data_type(_input->info()->data_type());

    execute_window_loop(window, [&](const Coordinates & id)
    {
        _func(input.ptr(), output.ptr(), input_stride);
    },
    input, output);
}
514}