/*
 * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/core/NEON/kernels/NECannyEdgeKernel.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "src/core/AccessWindowStatic.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"

#include <arm_neon.h>
#include <cstddef>
#include <cstdint>
#include <tuple>

namespace arm_compute
{
namespace
{
constexpr int NO_EDGE = 0;
constexpr int EDGE    = 255;
constexpr int MAYBE   = 127;

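/* Quantizes the gradient direction of 8 pixels into one of four bins.
 *
 * The absolute response along the 0°, 45°, 90° and 135° directions is compared
 * and the strongest one selects the bin: 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
 *
 * @param[in] gx Gx component (float)
 * @param[in] gy Gy component (float)
 *
 * @return quantized phase for 8 pixels
 */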
inline uint8x8_t phase_quantization(const float32x4x2_t &gx, const float32x4x2_t &gy)
{
    // Constants used for evaluating score1 and score3
    static const float32x4_t const45 = vdupq_n_f32(0.70710678118655f);
    static const float32x4_t zero    = vdupq_n_f32(0.0f);
    static const float32x4_t one     = vdupq_n_f32(1.0f);
    static const float32x4_t two     = vdupq_n_f32(2.0f);
    static const float32x4_t three   = vdupq_n_f32(3.0f);

    // Score0: (1, 0)
    const float32x4x2_t score0 =
    {
        {
            vabsq_f32(gx.val[0]),
            vabsq_f32(gx.val[1])
        }
    };

    // Score2: (0, 1)
    const float32x4x2_t score2 =
    {
        {
            vabsq_f32(gy.val[0]),
            vabsq_f32(gy.val[1])
        }
    };

    // Score1 and Score3: (sqrt(2)/2, sqrt(2)/2) and (-sqrt(2)/2, sqrt(2)/2)
    float32x4x2_t score1 =
    {
        {
            vmulq_f32(gy.val[0], const45),
            vmulq_f32(gy.val[1], const45)
        }
    };

    float32x4x2_t score3 = score1;

    score1.val[0] = vmlaq_f32(score1.val[0], gx.val[0], const45);
    score1.val[1] = vmlaq_f32(score1.val[1], gx.val[1], const45);
    score3.val[0] = vmlsq_f32(score3.val[0], gx.val[0], const45);
    score3.val[1] = vmlsq_f32(score3.val[1], gx.val[1], const45);

    score1.val[0] = vabsq_f32(score1.val[0]);
    score1.val[1] = vabsq_f32(score1.val[1]);
    score3.val[0] = vabsq_f32(score3.val[0]);
    score3.val[1] = vabsq_f32(score3.val[1]);

    float32x4x2_t phase =
    {
        {
            zero,
            zero
        }
    };

    float32x4x2_t old_score = score0;

    // score1 > old_score?
    uint32x4x2_t mask =
    {
        {
            vcgtq_f32(score1.val[0], old_score.val[0]),
            vcgtq_f32(score1.val[1], old_score.val[1])
        }
    };

    phase.val[0]     = vbslq_f32(mask.val[0], one, phase.val[0]);
    phase.val[1]     = vbslq_f32(mask.val[1], one, phase.val[1]);
    old_score.val[0] = vbslq_f32(mask.val[0], score1.val[0], old_score.val[0]);
    old_score.val[1] = vbslq_f32(mask.val[1], score1.val[1], old_score.val[1]);

    // score2 > old_score?
    mask.val[0] = vcgtq_f32(score2.val[0], old_score.val[0]);
    mask.val[1] = vcgtq_f32(score2.val[1], old_score.val[1]);

    phase.val[0]     = vbslq_f32(mask.val[0], two, phase.val[0]);
    phase.val[1]     = vbslq_f32(mask.val[1], two, phase.val[1]);
    old_score.val[0] = vbslq_f32(mask.val[0], score2.val[0], old_score.val[0]);
    old_score.val[1] = vbslq_f32(mask.val[1], score2.val[1], old_score.val[1]);

    // score3 > old_score?
    mask.val[0] = vcgtq_f32(score3.val[0], old_score.val[0]);
    mask.val[1] = vcgtq_f32(score3.val[1], old_score.val[1]);

    phase.val[0]     = vbslq_f32(mask.val[0], three, phase.val[0]);
    phase.val[1]     = vbslq_f32(mask.val[1], three, phase.val[1]);
    old_score.val[0] = vbslq_f32(mask.val[0], score3.val[0], old_score.val[0]);
    old_score.val[1] = vbslq_f32(mask.val[1], score3.val[1], old_score.val[1]);

    // Convert from float32x4_t to uint8x8_t
    return vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(phase.val[0])),
                                  vmovn_u32(vcvtq_u32_f32(phase.val[1]))));
}

/* Computes the gradient phase if gradient_size = 3 or 5. The output is quantized.
 * 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
 *
 * @param[in] gx Gx component
 * @param[in] gy Gy component
 *
 * @return quantized phase for 8 pixels
 */
inline uint8x8_t phase_quantization_S16_S16(int16x8_t gx, int16x8_t gy)
{
    // Convert to float
    const float32x4x2_t gx_f32 =
    {
        {
            vcvtq_f32_s32(vmovl_s16(vget_low_s16(gx))),
            vcvtq_f32_s32(vmovl_s16(vget_high_s16(gx)))
        }
    };

    const float32x4x2_t gy_f32 =
    {
        {
            vcvtq_f32_s32(vmovl_s16(vget_low_s16(gy))),
            vcvtq_f32_s32(vmovl_s16(vget_high_s16(gy)))
        }
    };

    return phase_quantization(gx_f32, gy_f32);
}

/* Computes the gradient phase if gradient_size = 7. The output is quantized.
 * 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
 *
 * @param[in] gx Gx component
 * @param[in] gy Gy component
 *
 * @return quantized phase for 8 pixels
 */
inline uint8x8_t phase_quantization_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
{
    // Convert to float
    const float32x4x2_t gx_f32 =
    {
        {
            vcvtq_f32_s32(gx.val[0]),
            vcvtq_f32_s32(gx.val[1])
        }
    };

    const float32x4x2_t gy_f32 =
    {
        {
            vcvtq_f32_s32(gy.val[0]),
            vcvtq_f32_s32(gy.val[1])
        }
    };

    return phase_quantization(gx_f32, gy_f32);
}

/* Computes the magnitude using the L1-norm type if gradient_size = 3 or 5
 *
 * @param[in] gx Gx component
 * @param[in] gy Gy component
 *
 * @return magnitude for 8 pixels
 */
inline uint16x8_t mag_l1_S16_S16(int16x8_t gx, int16x8_t gy)
{
    return vaddq_u16(vreinterpretq_u16_s16(vabsq_s16(gx)),
                     vreinterpretq_u16_s16(vabsq_s16(gy)));
}

/* Computes the magnitude using the L1-norm type if gradient_size = 7
 *
 * @param[in] gx Gx component
 * @param[in] gy Gy component
 *
 * @return magnitude for 8 pixels
 */
inline uint32x4x2_t mag_l1_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
{
    const uint32x4x2_t gx_abs =
    {
        {
            vreinterpretq_u32_s32(vabsq_s32(gx.val[0])),
            vreinterpretq_u32_s32(vabsq_s32(gx.val[1]))
        }
    };

    const uint32x4x2_t gy_abs =
    {
        {
            vreinterpretq_u32_s32(vabsq_s32(gy.val[0])),
            vreinterpretq_u32_s32(vabsq_s32(gy.val[1]))
        }
    };

    const uint32x4x2_t output =
    {
        {
            vaddq_u32(gx_abs.val[0], gy_abs.val[0]),
            vaddq_u32(gx_abs.val[1], gy_abs.val[1])
        }
    };

    return output;
}

inline float32x4x2_t mag_l2(const float32x4x2_t &gx, const float32x4x2_t &gy)
{
    // x^2 ...
    float32x4x2_t magnitude =
    {
        {
            vmulq_f32(gx.val[0], gx.val[0]),
            vmulq_f32(gx.val[1], gx.val[1])
        }
    };

    // ... + y^2
    magnitude.val[0] = vmlaq_f32(magnitude.val[0], gy.val[0], gy.val[0]);
    magnitude.val[1] = vmlaq_f32(magnitude.val[1], gy.val[1], gy.val[1]);

    // sqrt(...)
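    // Approximated as x * rsqrt_estimate(x): vrsqrteq_f32 returns an estimate of
    // 1/sqrt(x), so a single extra multiply yields sqrt(x) without a division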
    magnitude.val[0] = vmulq_f32(vrsqrteq_f32(magnitude.val[0]), magnitude.val[0]);
    magnitude.val[1] = vmulq_f32(vrsqrteq_f32(magnitude.val[1]), magnitude.val[1]);

    return magnitude;
}

/* Computes the magnitude using L2-norm if gradient_size = 3 or 5
 *
 * @param[in] gx Gx component
 * @param[in] gy Gy component
 *
 * @return magnitude for 8 pixels
 */
inline uint16x8_t mag_l2_S16_S16(int16x8_t gx, int16x8_t gy)
{
    // Compute magnitude using L2 normalization
    const float32x4x2_t gx2 =
    {
        {
            vcvtq_f32_s32(vmovl_s16(vget_low_s16(gx))),
            vcvtq_f32_s32(vmovl_s16(vget_high_s16(gx)))
        }
    };

    const float32x4x2_t gy2 =
    {
        {
            vcvtq_f32_s32(vmovl_s16(vget_low_s16(gy))),
            vcvtq_f32_s32(vmovl_s16(vget_high_s16(gy)))
        }
    };

    const float32x4x2_t magnitude = mag_l2(gx2, gy2);

    // Store magnitude - Convert to uint16x8
    return vcombine_u16(vmovn_u32(vcvtq_u32_f32(magnitude.val[0])),
                        vmovn_u32(vcvtq_u32_f32(magnitude.val[1])));
}

/* Computes the magnitude using L2-norm if gradient_size = 7
 *
 * @param[in] gx Gx component
 * @param[in] gy Gy component
 *
 * @return magnitude for 8 pixels
 */
inline uint32x4x2_t mag_l2_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
{
    // Compute magnitude using L2 normalization
    float32x4x2_t gx2 =
    {
        {
            vcvtq_f32_s32(gx.val[0]),
            vcvtq_f32_s32(gx.val[1])
        }
    };

    float32x4x2_t gy2 =
    {
        {
            vcvtq_f32_s32(gy.val[0]),
            vcvtq_f32_s32(gy.val[1])
        }
    };

    const float32x4x2_t magnitude = mag_l2(gx2, gy2);
    const uint32x4x2_t mag32 =
    {
        {
            vcvtq_u32_f32(magnitude.val[0]),
            vcvtq_u32_f32(magnitude.val[1])
        }
    };

    return mag32;
}

/* Gradient function used when the gradient size = 3 or 5 and when the norm_type = L1-norm
 *
 * @param[in] gx_ptr Pointer to source image. Gx image. Data type supported S16
 * @param[in] gy_ptr Pointer to source image. Gy image. Data type supported S16
 * @param[out] magnitude_ptr Pointer to destination image. Magnitude. Data type supported U16
 * @param[out] phase_ptr Pointer to destination image. Quantized phase. Data type supported U8
 */
void mag_phase_l1norm_S16_S16_U16_U8(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr)
{
    const auto gx        = static_cast<const int16_t *__restrict>(gx_ptr);
    const auto gy        = static_cast<const int16_t *__restrict>(gy_ptr);
    const auto magnitude = static_cast<uint16_t *__restrict>(magnitude_ptr);
    const auto phase     = static_cast<uint8_t *__restrict>(phase_ptr);

    const int16x8x4_t gx_val =
    {
        {
            vld1q_s16(gx),
            vld1q_s16(gx + 8),
            vld1q_s16(gx + 16),
            vld1q_s16(gx + 24)
        }
    };

    const int16x8x4_t gy_val =
    {
        {
            vld1q_s16(gy),
            vld1q_s16(gy + 8),
            vld1q_s16(gy + 16),
            vld1q_s16(gy + 24)
        }
    };

    // Compute and store phase
    vst1_u8(phase + 0, phase_quantization_S16_S16(gx_val.val[0], gy_val.val[0]));
    vst1_u8(phase + 8, phase_quantization_S16_S16(gx_val.val[1], gy_val.val[1]));
    vst1_u8(phase + 16, phase_quantization_S16_S16(gx_val.val[2], gy_val.val[2]));
    vst1_u8(phase + 24, phase_quantization_S16_S16(gx_val.val[3], gy_val.val[3]));

    // Compute and store magnitude using L1 normalization
    vst1q_u16(magnitude + 0, mag_l1_S16_S16(gx_val.val[0], gy_val.val[0]));
    vst1q_u16(magnitude + 8, mag_l1_S16_S16(gx_val.val[1], gy_val.val[1]));
    vst1q_u16(magnitude + 16, mag_l1_S16_S16(gx_val.val[2], gy_val.val[2]));
    vst1q_u16(magnitude + 24, mag_l1_S16_S16(gx_val.val[3], gy_val.val[3]));
}

/* Gradient function used when the gradient size = 3 or 5 and when the norm_type = L2-norm
 *
 * @param[in] gx_ptr Pointer to source image. Gx image. Data type supported S16
 * @param[in] gy_ptr Pointer to source image. Gy image. Data type supported S16
 * @param[out] magnitude_ptr Pointer to destination image. Magnitude. Data type supported U16
 * @param[out] phase_ptr Pointer to destination image. Quantized phase. Data type supported U8
 */
void mag_phase_l2norm_S16_S16_U16_U8(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr)
{
    const auto gx        = static_cast<const int16_t *__restrict>(gx_ptr);
    const auto gy        = static_cast<const int16_t *__restrict>(gy_ptr);
    const auto magnitude = static_cast<uint16_t *__restrict>(magnitude_ptr);
    const auto phase     = static_cast<uint8_t *__restrict>(phase_ptr);

    const int16x8x4_t gx_val =
    {
        {
            vld1q_s16(gx),
            vld1q_s16(gx + 8),
            vld1q_s16(gx + 16),
            vld1q_s16(gx + 24)
        }
    };

    const int16x8x4_t gy_val =
    {
        {
            vld1q_s16(gy),
            vld1q_s16(gy + 8),
            vld1q_s16(gy + 16),
            vld1q_s16(gy + 24)
        }
    };

    // Compute and store phase
    vst1_u8(phase + 0, phase_quantization_S16_S16(gx_val.val[0], gy_val.val[0]));
    vst1_u8(phase + 8, phase_quantization_S16_S16(gx_val.val[1], gy_val.val[1]));
    vst1_u8(phase + 16, phase_quantization_S16_S16(gx_val.val[2], gy_val.val[2]));
    vst1_u8(phase + 24, phase_quantization_S16_S16(gx_val.val[3], gy_val.val[3]));

    // Compute and store magnitude using L2 normalization
    vst1q_u16(magnitude + 0, mag_l2_S16_S16(gx_val.val[0], gy_val.val[0]));
    vst1q_u16(magnitude + 8, mag_l2_S16_S16(gx_val.val[1], gy_val.val[1]));
    vst1q_u16(magnitude + 16, mag_l2_S16_S16(gx_val.val[2], gy_val.val[2]));
    vst1q_u16(magnitude + 24, mag_l2_S16_S16(gx_val.val[3], gy_val.val[3]));
}

/* Gradient function used when the gradient size = 7 and when the norm_type = L1-norm
 *
 * @param[in] gx_ptr Pointer to source image. Gx image. Data type supported S32
 * @param[in] gy_ptr Pointer to source image. Gy image. Data type supported S32
 * @param[out] magnitude_ptr Pointer to destination image. Magnitude. Data type supported U32
 * @param[out] phase_ptr Pointer to destination image. Quantized phase. Data type supported U8
 */
void mag_phase_l1norm_S32_S32_U32_U8(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr)
{
    auto gx        = static_cast<const int32_t *__restrict>(gx_ptr);
    auto gy        = static_cast<const int32_t *__restrict>(gy_ptr);
    auto magnitude = static_cast<uint32_t *__restrict>(magnitude_ptr);
    auto phase     = static_cast<uint8_t *__restrict>(phase_ptr);

    // Process low and high part
    for(size_t i = 0; i < 2; ++i, gx += 16, gy += 16, magnitude += 16, phase += 16)
    {
        const int32x4x2_t gx0 =
        {
            {
                vld1q_s32(gx + 0),
                vld1q_s32(gx + 4)
            }
        };

        const int32x4x2_t gx1 =
        {
            {
                vld1q_s32(gx + 8),
                vld1q_s32(gx + 12)
            }
        };

        const int32x4x2_t gy0 =
        {
            {
                vld1q_s32(gy + 0),
                vld1q_s32(gy + 4)
            }
        };

        const int32x4x2_t gy1 =
        {
            {
                vld1q_s32(gy + 8),
                vld1q_s32(gy + 12)
            }
        };

        // Compute and store phase
        vst1_u8(phase + 0, phase_quantization_S32_S32(gx0, gy0));
        vst1_u8(phase + 8, phase_quantization_S32_S32(gx1, gy1));

        // Compute magnitude using L1 normalization
        const uint32x4x2_t mag0 = mag_l1_S32_S32(gx0, gy0);
        const uint32x4x2_t mag1 = mag_l1_S32_S32(gx1, gy1);

        // Store magnitude
        vst1q_u32(magnitude + 0, mag0.val[0]);
        vst1q_u32(magnitude + 4, mag0.val[1]);
        vst1q_u32(magnitude + 8, mag1.val[0]);
        vst1q_u32(magnitude + 12, mag1.val[1]);
    }
}

/* Gradient function used when the gradient size = 7 and when the norm_type = L2-norm
 *
 * @param[in] gx_ptr Pointer to source image. Gx image. Data type supported S32
 * @param[in] gy_ptr Pointer to source image. Gy image. Data type supported S32
 * @param[out] magnitude_ptr Pointer to destination image. Magnitude. Data type supported U32
 * @param[out] phase_ptr Pointer to destination image. Quantized phase. Data type supported U8
 */
void mag_phase_l2norm_S32_S32_U32_U8(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr)
{
    auto gx        = static_cast<const int32_t *__restrict>(gx_ptr);
    auto gy        = static_cast<const int32_t *__restrict>(gy_ptr);
    auto magnitude = static_cast<uint32_t *__restrict>(magnitude_ptr);
    auto phase     = static_cast<uint8_t *__restrict>(phase_ptr);

    // Process low and high part
    for(size_t i = 0; i < 2; ++i, gx += 16, gy += 16, magnitude += 16, phase += 16)
    {
        const int32x4x2_t gx0 =
        {
            {
                vld1q_s32(gx + 0),
                vld1q_s32(gx + 4)
            }
        };

        const int32x4x2_t gx1 =
        {
            {
                vld1q_s32(gx + 8),
                vld1q_s32(gx + 12)
            }
        };

        const int32x4x2_t gy0 =
        {
            {
                vld1q_s32(gy + 0),
                vld1q_s32(gy + 4)
            }
        };

        const int32x4x2_t gy1 =
        {
            {
                vld1q_s32(gy + 8),
                vld1q_s32(gy + 12)
            }
        };

        // Compute and store phase
        vst1_u8(phase + 0, phase_quantization_S32_S32(gx0, gy0));
        vst1_u8(phase + 8, phase_quantization_S32_S32(gx1, gy1));

        // Compute magnitude using L2 normalization
        const uint32x4x2_t mag0 = mag_l2_S32_S32(gx0, gy0);
        const uint32x4x2_t mag1 = mag_l2_S32_S32(gx1, gy1);

        // Store magnitude
        vst1q_u32(magnitude + 0, mag0.val[0]);
        vst1q_u32(magnitude + 4, mag0.val[1]);
        vst1q_u32(magnitude + 8, mag1.val[0]);
        vst1q_u32(magnitude + 12, mag1.val[1]);
    }
}

/* Computes non-maxima suppression and hysteresis when the gradient size = 3 or 5
 *
 * @param[in] magnitude_ptr Pointer to source image. Magnitude. Data type supported U16
 * @param[in] phase_ptr Pointer to source image. Quantized phase. Data type supported U8
 * @param[out] output_ptr Pointer to output image. Data type supported U8
 * @param[in] stride_mag Stride of magnitude image
 * @param[in] lower_thr Lower threshold used for the hysteresis
 * @param[in] upper_thr Upper threshold used for the hysteresis
 */
void non_max_suppression_U16_U8_U8(const void *__restrict magnitude_ptr, const void *__restrict phase_ptr, void *__restrict output_ptr, const uint32_t stride_mag, const int32_t lower_thr,
                                   const int32_t upper_thr)
{
    const auto magnitude = static_cast<const uint16_t *__restrict>(magnitude_ptr);
    const auto phase     = static_cast<const uint8_t *__restrict>(phase_ptr);
    const auto output    = static_cast<uint8_t *__restrict>(output_ptr);

    // Get magnitude and phase of the centre pixels
    uint16x8_t mc = vld1q_u16(magnitude);

    // Angle_quantized: 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
    const uint16x8_t pc16 = vmovl_u8(vld1_u8(phase));

    // 0 degree
    const uint16x8_t mk0_0 = vld1q_u16(magnitude - 1);
    const uint16x8_t mk0_1 = vld1q_u16(magnitude + 1);
    uint16x8_t mask0 = vceqq_u16(pc16, vdupq_n_u16(0));
    mask0 = vandq_u16(mask0, vcgtq_u16(mc, mk0_0));
    mask0 = vandq_u16(mask0, vcgtq_u16(mc, mk0_1));

    // 45 degree
    const uint16x8_t mk45_0 = vld1q_u16(magnitude - stride_mag - 1);
    const uint16x8_t mk45_1 = vld1q_u16(magnitude + stride_mag + 1);
    uint16x8_t mask1 = vceqq_u16(pc16, vdupq_n_u16(1));
    mask1 = vandq_u16(mask1, vcgtq_u16(mc, mk45_0));
    mask1 = vandq_u16(mask1, vcgtq_u16(mc, mk45_1));

    // 90 degree
    const uint16x8_t mk90_0 = vld1q_u16(magnitude - stride_mag);
    const uint16x8_t mk90_1 = vld1q_u16(magnitude + stride_mag);
    uint16x8_t mask2 = vceqq_u16(pc16, vdupq_n_u16(2));
    mask2 = vandq_u16(mask2, vcgtq_u16(mc, mk90_0));
    mask2 = vandq_u16(mask2, vcgtq_u16(mc, mk90_1));

    // 135 degree
    const uint16x8_t mk135_0 = vld1q_u16(magnitude - stride_mag + 1);
    const uint16x8_t mk135_1 = vld1q_u16(magnitude + stride_mag - 1);
    uint16x8_t mask3 = vceqq_u16(pc16, vdupq_n_u16(3));
    mask3 = vandq_u16(mask3, vcgtq_u16(mc, mk135_0));
    mask3 = vandq_u16(mask3, vcgtq_u16(mc, mk135_1));

    // Merge masks
    mask0 = vorrq_u16(mask0, mask1);
    mask2 = vorrq_u16(mask2, mask3);
    mask0 = vorrq_u16(mask0, mask2);

    mc = vbslq_u16(mask0, mc, vdupq_n_u16(0));

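    // Hysteresis labelling: suppressed pixels were zeroed above; surviving maxima
    // above upper_thr become EDGE, values not above lower_thr become NO_EDGE, and
    // values in between are marked MAYBE for the edge-trace stage to resolve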
    // mc > upper_thr
    mask0 = vcgtq_u16(mc, vdupq_n_u16(upper_thr));

    // mc <= lower_thr
    mask1 = vcleq_u16(mc, vdupq_n_u16(lower_thr));

    // mc <= upper_thr && mc > lower_thr
    mask2 = vcleq_u16(mc, vdupq_n_u16(upper_thr));
    mask2 = vandq_u16(mask2, vcgtq_u16(mc, vdupq_n_u16(lower_thr)));

    mc = vbslq_u16(mask0, vdupq_n_u16(EDGE), mc);
    mc = vbslq_u16(mask1, vdupq_n_u16(NO_EDGE), mc);
    mc = vbslq_u16(mask2, vdupq_n_u16(MAYBE), mc);

    vst1_u8(output, vmovn_u16(mc));
}

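/* Performs non-maxima suppression and hysteresis thresholding for 4 U32 magnitude values
 *
 * @param[in] input Pointer to the magnitude of the first centre pixel. Data type supported U32
 * @param[in] pc Quantized phase of the 4 pixels
 * @param[in] stride_mag Stride of the magnitude image (in elements)
 * @param[in] lower_thr Lower threshold used for the hysteresis
 * @param[in] upper_thr Upper threshold used for the hysteresis
 *
 * @return U16 vector with each lane set to EDGE, NO_EDGE or MAYBE
 */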
inline uint16x4_t non_max_U32_helper(const uint32_t *input, const uint16x4_t pc, const uint32_t stride_mag, const int32_t lower_thr, const int32_t upper_thr)
{
    // Phase for 4 pixels
    const uint32x4_t pc32 = vmovl_u16(pc);

    // Get magnitude for 4 pixels
    uint32x4_t mc = vld1q_u32(input);

    // Angle_quantized: 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
    // 0 degree
    const uint32x4_t mk0_0 = vld1q_u32(input - 1);
    const uint32x4_t mk0_1 = vld1q_u32(input + 1);
    uint32x4_t mask0 = vceqq_u32(pc32, vdupq_n_u32(0));
    mask0 = vandq_u32(mask0, vcgtq_u32(mc, mk0_0));
    mask0 = vandq_u32(mask0, vcgtq_u32(mc, mk0_1));

    // 45 degree
    const uint32x4_t mk45_0 = vld1q_u32(input - stride_mag - 1);
    const uint32x4_t mk45_1 = vld1q_u32(input + stride_mag + 1);
    uint32x4_t mask1 = vceqq_u32(pc32, vdupq_n_u32(1));
    mask1 = vandq_u32(mask1, vcgtq_u32(mc, mk45_0));
    mask1 = vandq_u32(mask1, vcgtq_u32(mc, mk45_1));

    // 90 degree
    const uint32x4_t mk90_0 = vld1q_u32(input - stride_mag);
    const uint32x4_t mk90_1 = vld1q_u32(input + stride_mag);
    uint32x4_t mask2 = vceqq_u32(pc32, vdupq_n_u32(2));
    mask2 = vandq_u32(mask2, vcgtq_u32(mc, mk90_0));
    mask2 = vandq_u32(mask2, vcgtq_u32(mc, mk90_1));

    // 135 degree
    const uint32x4_t mk135_0 = vld1q_u32(input - stride_mag + 1);
    const uint32x4_t mk135_1 = vld1q_u32(input + stride_mag - 1);
    uint32x4_t mask3 = vceqq_u32(pc32, vdupq_n_u32(3));
    mask3 = vandq_u32(mask3, vcgtq_u32(mc, mk135_0));
    mask3 = vandq_u32(mask3, vcgtq_u32(mc, mk135_1));

    // Merge masks
    mask0 = vorrq_u32(mask0, mask1);
    mask2 = vorrq_u32(mask2, mask3);
    mask0 = vorrq_u32(mask0, mask2);

    mc = vbslq_u32(mask0, mc, vdupq_n_u32(0));

    // mc > upper_thr
    mask0 = vcgtq_u32(mc, vdupq_n_u32(upper_thr));

    // mc <= lower_thr
    mask1 = vcleq_u32(mc, vdupq_n_u32(lower_thr));

    // mc <= upper_thr && mc > lower_thr
    mask2 = vcleq_u32(mc, vdupq_n_u32(upper_thr));
    mask2 = vandq_u32(mask2, vcgtq_u32(mc, vdupq_n_u32(lower_thr)));

    mc = vbslq_u32(mask0, vdupq_n_u32(EDGE), mc);
    mc = vbslq_u32(mask1, vdupq_n_u32(NO_EDGE), mc);
    mc = vbslq_u32(mask2, vdupq_n_u32(MAYBE), mc);

    return vmovn_u32(mc);
}

/* Computes non-maxima suppression and hysteresis when the gradient_size = 7
 *
 * @param[in] magnitude_ptr Pointer to source image. Magnitude. Data type supported U32
 * @param[in] phase_ptr Pointer to source image. Quantized phase. Data type supported U8
 * @param[out] output_ptr Pointer to destination image. Data type supported U8
 * @param[in] stride_mag Stride of magnitude image
 * @param[in] lower_thr Lower threshold used for the hysteresis
 * @param[in] upper_thr Upper threshold used for the hysteresis
 */
void non_max_suppression_U32_U8_U8(const void *__restrict magnitude_ptr, const void *__restrict phase_ptr, void *__restrict output_ptr, const uint32_t stride_mag, const int32_t lower_thr,
                                   const int32_t upper_thr)
{
    const auto magnitude = static_cast<const uint32_t *__restrict>(magnitude_ptr);
    const auto phase     = static_cast<const uint8_t *__restrict>(phase_ptr);
    const auto output    = static_cast<uint8_t *__restrict>(output_ptr);

    // Get phase for 8 pixels
    const uint16x8_t pc16 = vmovl_u8(vld1_u8(phase));

    // Compute non maxima suppression
    const uint16x4x2_t res =
    {
        {
            non_max_U32_helper(magnitude, vget_low_u16(pc16), stride_mag, lower_thr, upper_thr),
            non_max_U32_helper(magnitude + 4, vget_high_u16(pc16), stride_mag, lower_thr, upper_thr)
        }
    };

    // Store result
    vst1_u8(output, vmovn_u16(vcombine_u16(res.val[0], res.val[1])));
}

/* Recursive edge tracing; called by edge_trace_U8_U8
 *
 * @param[in] input Pointer to source image. Data type supported U8
 * @param[out] output Pointer to destination image. Data type supported U8
 * @param[in] input_stride Stride of the input image
 * @param[in] output_stride Stride of the output image
 */
void edge_trace_recursive_U8_U8(uint8_t *__restrict input, uint8_t *__restrict output, const int32_t input_stride, const int32_t output_stride)
{
    // Look for MAYBE pixels in 8 directions
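    // Every MAYBE neighbour found is promoted to EDGE in the input and the trace
    // recurses from it, so chains of MAYBE pixels connected to a strong edge are kept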
    *output = EDGE;

    // (-1, 0)
    uint8_t pixel = *(input - 1);

    if(pixel == MAYBE)
    {
        // Touched a MAYBE point. MAYBE becomes EDGE
        *(input - 1) = EDGE;

        edge_trace_recursive_U8_U8(input - 1, output - 1, input_stride, output_stride);
    }

    // (+1, 0)
    pixel = *(input + 1);

    if(pixel == MAYBE)
    {
        // Touched a MAYBE point. MAYBE becomes EDGE
        *(input + 1) = EDGE;

        edge_trace_recursive_U8_U8(input + 1, output + 1, input_stride, output_stride);
    }

    input -= input_stride;
    output -= output_stride;

    // (-1, -1)
    pixel = *(input - 1);

    if(pixel == MAYBE)
    {
        // Touched a MAYBE point. MAYBE becomes EDGE
        *(input - 1) = EDGE;

        edge_trace_recursive_U8_U8(input - 1, output - 1, input_stride, output_stride);
    }

    // (0, -1)
    pixel = *input;

    if(pixel == MAYBE)
    {
        // Touched a MAYBE point. MAYBE becomes EDGE
        *input = EDGE;

        edge_trace_recursive_U8_U8(input, output, input_stride, output_stride);
    }

    // (+1, -1)
    pixel = *(input + 1);

    if(pixel == MAYBE)
    {
        // Touched a MAYBE point. MAYBE becomes EDGE
        *(input + 1) = EDGE;

        edge_trace_recursive_U8_U8(input + 1, output + 1, input_stride, output_stride);
    }

    input += input_stride * 2;
    output += output_stride * 2;

    // (-1, +1)
    pixel = *(input - 1);

    if(pixel == MAYBE)
    {
        // Touched a MAYBE point. MAYBE becomes EDGE
        *(input - 1) = EDGE;

        edge_trace_recursive_U8_U8(input - 1, output - 1, input_stride, output_stride);
    }

    // (0, +1)
    pixel = *input;

    if(pixel == MAYBE)
    {
        // Touched a MAYBE point. MAYBE becomes EDGE
        *input = EDGE;

        edge_trace_recursive_U8_U8(input, output, input_stride, output_stride);
    }

    // (+1, +1)
    pixel = *(input + 1);

    if(pixel == MAYBE)
    {
        // Touched a MAYBE point. MAYBE becomes EDGE
        *(input + 1) = EDGE;

        edge_trace_recursive_U8_U8(input + 1, output + 1, input_stride, output_stride);
    }
}

/* Computes edge tracing
 *
 * @param[in] input Pointer to source image. Data type supported U8
 * @param[out] output Pointer to destination image. Data type supported U8
 * @param[in] input_stride Stride of the input image
 * @param[in] output_stride Stride of the output image
 */
void edge_trace_U8_U8(uint8_t *__restrict input, uint8_t *__restrict output, const int32_t input_stride, const int32_t output_stride)
{
    if(*input == NO_EDGE)
    {
        *output = NO_EDGE;
    }
    // Check if EDGE and not yet touched
    else if((*input == EDGE) && (*output == NO_EDGE))
    {
        edge_trace_recursive_U8_U8(input, output, input_stride, output_stride);
    }
}
} // namespace

NEGradientKernel::~NEGradientKernel() = default;

NEGradientKernel::NEGradientKernel()
    : _func(nullptr), _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr)
{
}

void NEGradientKernel::configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(gx, gy, magnitude, phase);

    set_shape_if_empty(*magnitude->info(), gx->info()->tensor_shape());
    set_shape_if_empty(*phase->info(), gx->info()->tensor_shape());

    Format magnitude_format = gx->info()->data_type() == DataType::S16 ? Format::U16 : Format::U32;
    set_format_if_unknown(*magnitude->info(), magnitude_format);
    set_format_if_unknown(*phase->info(), Format::U8);

    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(gx, gy, magnitude, phase);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, gy);
    ARM_COMPUTE_ERROR_ON_MSG(element_size_from_data_type(gx->info()->data_type()) != element_size_from_data_type(magnitude->info()->data_type()), "Magnitude must have the same element size as Gx and Gy");

    _gx        = gx;
    _gy        = gy;
    _magnitude = magnitude;
    _phase     = phase;

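    // norm_type == 1 selects the L1 norm; any other value falls back to the L2 norm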
    if(_gx->info()->data_type() == DataType::S16)
    {
        if(norm_type == 1)
        {
            _func = &mag_phase_l1norm_S16_S16_U16_U8;
        }
        else
        {
            _func = &mag_phase_l2norm_S16_S16_U16_U8;
        }
    }
    else
    {
        if(norm_type == 1)
        {
            _func = &mag_phase_l1norm_S32_S32_U32_U8;
        }
        else
        {
            _func = &mag_phase_l2norm_S32_S32_U32_U8;
        }
    }

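    // Each iteration processes 32 pixels: four 8-lane vectors in the S16 path,
    // two blocks of 16 values in the S32 path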
    constexpr unsigned int num_elems_processed_per_iteration = 32;

    // Configure kernel window
    Window win = calculate_max_window(*_gx->info(), Steps(num_elems_processed_per_iteration));

    AccessWindowHorizontal gx_access(_gx->info(), 0, num_elems_processed_per_iteration);
    AccessWindowHorizontal gy_access(_gy->info(), 0, num_elems_processed_per_iteration);
    AccessWindowHorizontal mag_access(_magnitude->info(), 0, num_elems_processed_per_iteration);
    AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration);

    ARM_COMPUTE_UNUSED(update_window_and_padding(win, gx_access, gy_access, mag_access, phase_access));

    mag_access.set_valid_region(win, _gx->info()->valid_region());
    phase_access.set_valid_region(win, _gx->info()->valid_region());

    INEKernel::configure(win);
}

void NEGradientKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
    ARM_COMPUTE_ERROR_ON(_func == nullptr);
    Iterator gx(_gx, window);
    Iterator gy(_gy, window);
    Iterator magnitude(_magnitude, window);
    Iterator phase(_phase, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        (*_func)(gx.ptr(), gy.ptr(), magnitude.ptr(), phase.ptr());
    },
    gx, gy, magnitude, phase);
}

NEEdgeNonMaxSuppressionKernel::~NEEdgeNonMaxSuppressionKernel() = default;
NEEdgeNonMaxSuppressionKernel::NEEdgeNonMaxSuppressionKernel()
    : _func(nullptr), _magnitude(nullptr), _phase(nullptr), _output(nullptr), _lower_thr(0), _upper_thr(0)
{
}

BorderSize NEEdgeNonMaxSuppressionKernel::border_size() const
{
    return BorderSize(1);
}

void NEEdgeNonMaxSuppressionKernel::configure(const ITensor *magnitude, const ITensor *phase, ITensor *output,
                                              int32_t upper_thr, int32_t lower_thr, bool border_undefined)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(magnitude, phase, output);

    set_shape_if_empty(*output->info(), magnitude->info()->tensor_shape());

    set_format_if_unknown(*phase->info(), Format::U8);
    set_format_if_unknown(*output->info(), Format::U8);

    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(magnitude, phase, output);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(phase, output);

    _magnitude = magnitude;
    _phase     = phase;
    _output    = output;

    switch(_magnitude->info()->data_type())
    {
        case DataType::U16:
            _func = &non_max_suppression_U16_U8_U8;
            break;
        case DataType::U32:
            _func = &non_max_suppression_U32_U8_U8;
            break;
        default:
            ARM_COMPUTE_ERROR("Unsupported data type!");
    }

    // Set thresholds
    _lower_thr = lower_thr;
    _upper_thr = upper_thr;

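    // Each iteration produces 8 output pixels but reads a 3-row window of 10
    // magnitude values, i.e. the 8 centre pixels plus one border element per side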
    constexpr unsigned int num_elems_processed_per_iteration = 8;
    constexpr unsigned int num_elems_read_per_iteration      = 10;
    constexpr unsigned int num_rows_read_per_iteration       = 3;

    // Configure kernel window
    Window win = calculate_max_window(*_magnitude->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());

    AccessWindowRectangle mag_access(_magnitude->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
    AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration);
    AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration);

    update_window_and_padding(win, mag_access, phase_access, output_access);

    output_access.set_valid_region(win, _magnitude->info()->valid_region(), border_undefined, border_size());

    INEKernel::configure(win);
}

void NEEdgeNonMaxSuppressionKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
    ARM_COMPUTE_ERROR_ON(_func == nullptr);
    Iterator magnitude(_magnitude, window);
    Iterator phase(_phase, window);
    Iterator output(_output, window);

    const size_t input1_stride        = _magnitude->info()->strides_in_bytes()[1];
    const size_t input1_stride_ushort = input1_stride / data_size_from_type(_magnitude->info()->data_type());

    execute_window_loop(window, [&](const Coordinates &)
    {
        (*_func)(magnitude.ptr(), phase.ptr(), output.ptr(), input1_stride_ushort, _lower_thr, _upper_thr);
    },
    magnitude, phase, output);
}

NEEdgeTraceKernel::~NEEdgeTraceKernel() = default;
NEEdgeTraceKernel::NEEdgeTraceKernel()
    : _input(nullptr), _output(nullptr)
{
}

BorderSize NEEdgeTraceKernel::border_size() const
{
    return BorderSize(1);
}

bool NEEdgeTraceKernel::is_parallelisable() const
{
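    // Edge tracing follows connected pixels recursively across the whole image,
    // so the work cannot be split into independent windows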
    return false;
}

void NEEdgeTraceKernel::configure(ITensor *input, ITensor *output)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);

    set_shape_if_empty(*output->info(), input->info()->tensor_shape());

    set_format_if_unknown(*input->info(), Format::U8);
    set_format_if_unknown(*output->info(), Format::U8);

    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);

    _input  = input;
    _output = output;

    constexpr unsigned int num_elems_processed_per_iteration = 1;

    // Configure kernel window
    Window win = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration));

    const ValidRegion &input_valid_region  = input->info()->valid_region();
    const ValidRegion &output_valid_region = output->info()->valid_region();

    // Reads can occur within the valid region of the input + border
    AccessWindowStatic input_access(input->info(),
                                    input_valid_region.anchor[0] - border_size().left,
                                    input_valid_region.anchor[1] - border_size().top,
                                    input_valid_region.anchor[0] + input_valid_region.shape[0] + border_size().right,
                                    input_valid_region.anchor[1] + input_valid_region.shape[1] + border_size().bottom);

    // Writes can occur within the valid region of the output + border
    AccessWindowStatic output_access(output->info(),
                                     output_valid_region.anchor[0] - border_size().left,
                                     output_valid_region.anchor[1] - border_size().top,
                                     output_valid_region.anchor[0] + output_valid_region.shape[0] + border_size().right,
                                     output_valid_region.anchor[1] + output_valid_region.shape[1] + border_size().bottom);

    update_window_and_padding(win, input_access, output_access);

    output_access.set_valid_region(win, _input->info()->valid_region());

    INEKernel::configure(win);
}

void NEEdgeTraceKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
    Iterator input(_input, window);
    Iterator output(_output, window);

    const size_t input_stride  = _input->info()->strides_in_bytes()[1];
    const size_t output_stride = _output->info()->strides_in_bytes()[1];

    execute_window_loop(window, [&](const Coordinates &)
    {
        edge_trace_U8_U8(input.ptr(), output.ptr(), input_stride, output_stride);
    },
    input, output);
}
} // namespace arm_compute