/*
 * Copyright (c) 2016-2019 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/NEON/kernels/NECannyEdgeKernel.h"

#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"

#include <arm_neon.h>
#include <cstddef>
#include <cstdint>
#include <tuple>

using namespace arm_compute;

namespace arm_compute
{
class Coordinates;
} // namespace arm_compute
namespace
{
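// Pixel labels used by the hysteresis stage: NO_EDGE and EDGE are final
// classifications, while MAYBE marks candidate pixels that edge tracing may
// still promote to EDGE if they are connected to one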
constexpr int NO_EDGE = 0;
constexpr int EDGE    = 255;
constexpr int MAYBE   = 127;
} // namespace

namespace
{
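/* Quantizes the gradient direction of 8 pixels by scoring the gradient against
 * the four reference directions (0°, 45°, 90°, 135°) and keeping the direction
 * with the largest absolute projection.
 * 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
 *
 * For example, gx = 1 and gy = 1 gives score0 = 1, score1 = sqrt(2), score2 = 1
 * and score3 = 0, so the returned phase is 1 (45°).
 *
 * @param[in] gx Gx component
 * @param[in] gy Gy component
 *
 * @return quantized phase for 8 pixels
 */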
inline uint8x8_t phase_quantization(const float32x4x2_t &gx, const float32x4x2_t &gy)
{
    // Constant used for evaluating score1 and score3
    static const float32x4_t const45 = vdupq_n_f32(0.70710678118655f);
    static const float32x4_t zero    = vdupq_n_f32(0.0f);
    static const float32x4_t one     = vdupq_n_f32(1.0f);
    static const float32x4_t two     = vdupq_n_f32(2.0f);
    static const float32x4_t three   = vdupq_n_f32(3.0f);

    // Score0: (1, 0)
    const float32x4x2_t score0 =
    {
        {
            vabsq_f32(gx.val[0]),
            vabsq_f32(gx.val[1])
        }
    };

    // Score2: (0, 1)
    const float32x4x2_t score2 =
    {
        {
            vabsq_f32(gy.val[0]),
            vabsq_f32(gy.val[1])
        }
    };

    // Score1 and Score3: projections onto (sqrt(2) / 2, sqrt(2) / 2) and (-sqrt(2) / 2, sqrt(2) / 2)
    float32x4x2_t score1 =
    {
        {
            vmulq_f32(gy.val[0], const45),
            vmulq_f32(gy.val[1], const45)
        }
    };

    float32x4x2_t score3 = score1;

    score1.val[0] = vmlaq_f32(score1.val[0], gx.val[0], const45);
    score1.val[1] = vmlaq_f32(score1.val[1], gx.val[1], const45);
    score3.val[0] = vmlsq_f32(score3.val[0], gx.val[0], const45);
    score3.val[1] = vmlsq_f32(score3.val[1], gx.val[1], const45);

    score1.val[0] = vabsq_f32(score1.val[0]);
    score1.val[1] = vabsq_f32(score1.val[1]);
    score3.val[0] = vabsq_f32(score3.val[0]);
    score3.val[1] = vabsq_f32(score3.val[1]);

    float32x4x2_t phase =
    {
        {
            zero,
            zero
        }
    };

    float32x4x2_t old_score = score0;

    // score1 > old_score?
    uint32x4x2_t mask =
    {
        {
            vcgtq_f32(score1.val[0], old_score.val[0]),
            vcgtq_f32(score1.val[1], old_score.val[1])
        }
    };

    phase.val[0]     = vbslq_f32(mask.val[0], one, phase.val[0]);
    phase.val[1]     = vbslq_f32(mask.val[1], one, phase.val[1]);
    old_score.val[0] = vbslq_f32(mask.val[0], score1.val[0], old_score.val[0]);
    old_score.val[1] = vbslq_f32(mask.val[1], score1.val[1], old_score.val[1]);

    // score2 > old_score?
    mask.val[0] = vcgtq_f32(score2.val[0], old_score.val[0]);
    mask.val[1] = vcgtq_f32(score2.val[1], old_score.val[1]);

    phase.val[0]     = vbslq_f32(mask.val[0], two, phase.val[0]);
    phase.val[1]     = vbslq_f32(mask.val[1], two, phase.val[1]);
    old_score.val[0] = vbslq_f32(mask.val[0], score2.val[0], old_score.val[0]);
    old_score.val[1] = vbslq_f32(mask.val[1], score2.val[1], old_score.val[1]);

    // score3 > old_score?
    mask.val[0] = vcgtq_f32(score3.val[0], old_score.val[0]);
    mask.val[1] = vcgtq_f32(score3.val[1], old_score.val[1]);

    phase.val[0]     = vbslq_f32(mask.val[0], three, phase.val[0]);
    phase.val[1]     = vbslq_f32(mask.val[1], three, phase.val[1]);
    old_score.val[0] = vbslq_f32(mask.val[0], score3.val[0], old_score.val[0]);
    old_score.val[1] = vbslq_f32(mask.val[1], score3.val[1], old_score.val[1]);

    // Convert from float32x4_t to uint8x8_t
    return vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(phase.val[0])),
                                  vmovn_u32(vcvtq_u32_f32(phase.val[1]))));
}

/* Computes the gradient phase if gradient_size = 3 or 5. The output is quantized.
 * 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
 *
 * @param[in] gx Gx component
 * @param[in] gy Gy component
 *
 * @return quantized phase for 8 pixels
 */
inline uint8x8_t phase_quantization_S16_S16(int16x8_t gx, int16x8_t gy)
{
    // Convert to float
    const float32x4x2_t gx_f32 =
    {
        {
            vcvtq_f32_s32(vmovl_s16(vget_low_s16(gx))),
            vcvtq_f32_s32(vmovl_s16(vget_high_s16(gx)))
        }
    };

    const float32x4x2_t gy_f32 =
    {
        {
            vcvtq_f32_s32(vmovl_s16(vget_low_s16(gy))),
            vcvtq_f32_s32(vmovl_s16(vget_high_s16(gy)))
        }
    };

    return phase_quantization(gx_f32, gy_f32);
}

/* Computes the gradient phase if gradient_size = 7. The output is quantized.
 * 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
 *
 * @param[in] gx Gx component
 * @param[in] gy Gy component
 *
 * @return quantized phase for 8 pixels
 */
inline uint8x8_t phase_quantization_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
{
    // Convert to float
    const float32x4x2_t gx_f32 =
    {
        {
            vcvtq_f32_s32(gx.val[0]),
            vcvtq_f32_s32(gx.val[1])
        }
    };

    const float32x4x2_t gy_f32 =
    {
        {
            vcvtq_f32_s32(gy.val[0]),
            vcvtq_f32_s32(gy.val[1])
        }
    };

    return phase_quantization(gx_f32, gy_f32);
}

/* Computes the magnitude using the L1-norm type if gradient_size = 3 or 5
 *
 * @param[in] gx Gx component
 * @param[in] gy Gy component
 *
 * @return magnitude for 8 pixels
 */
inline uint16x8_t mag_l1_S16_S16(int16x8_t gx, int16x8_t gy)
{
    return vaddq_u16(vreinterpretq_u16_s16(vabsq_s16(gx)),
                     vreinterpretq_u16_s16(vabsq_s16(gy)));
}

/* Computes the magnitude using the L1-norm type if gradient_size = 7
 *
 * @param[in] gx Gx component
 * @param[in] gy Gy component
 *
 * @return magnitude for 8 pixels
 */
inline uint32x4x2_t mag_l1_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
{
    const uint32x4x2_t gx_abs =
    {
        {
            vreinterpretq_u32_s32(vabsq_s32(gx.val[0])),
            vreinterpretq_u32_s32(vabsq_s32(gx.val[1]))
        }
    };

    const uint32x4x2_t gy_abs =
    {
        {
            vreinterpretq_u32_s32(vabsq_s32(gy.val[0])),
            vreinterpretq_u32_s32(vabsq_s32(gy.val[1]))
        }
    };

    const uint32x4x2_t output =
    {
        {
            vaddq_u32(gx_abs.val[0], gy_abs.val[0]),
            vaddq_u32(gx_abs.val[1], gy_abs.val[1])
        }
    };

    return output;
}

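/* Computes an approximate L2-norm magnitude, sqrt(gx^2 + gy^2), for 8 pixels.
 * The square root is evaluated as x * rsqrt(x) using the reciprocal square
 * root estimate (vrsqrteq_f32), so the result is an approximation
 *
 * @param[in] gx Gx component
 * @param[in] gy Gy component
 *
 * @return magnitude for 8 pixels
 */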
inline float32x4x2_t mag_l2(const float32x4x2_t &gx, const float32x4x2_t &gy)
{
    // x^2 ...
    float32x4x2_t magnitude =
    {
        {
            vmulq_f32(gx.val[0], gx.val[0]),
            vmulq_f32(gx.val[1], gx.val[1])
        }
    };

    // ... + y^2
    magnitude.val[0] = vmlaq_f32(magnitude.val[0], gy.val[0], gy.val[0]);
    magnitude.val[1] = vmlaq_f32(magnitude.val[1], gy.val[1], gy.val[1]);

    // sqrt(...)
    magnitude.val[0] = vmulq_f32(vrsqrteq_f32(magnitude.val[0]), magnitude.val[0]);
    magnitude.val[1] = vmulq_f32(vrsqrteq_f32(magnitude.val[1]), magnitude.val[1]);

    return magnitude;
}

/* Computes the magnitude using L2-norm if gradient_size = 3 or 5
 *
 * @param[in] gx Gx component
 * @param[in] gy Gy component
 *
 * @return magnitude for 8 pixels
 */
inline uint16x8_t mag_l2_S16_S16(int16x8_t gx, int16x8_t gy)
{
    // Compute magnitude using L2 normalization
    const float32x4x2_t gx2 =
    {
        {
            vcvtq_f32_s32(vmovl_s16(vget_low_s16(gx))),
            vcvtq_f32_s32(vmovl_s16(vget_high_s16(gx)))
        }
    };

    const float32x4x2_t gy2 =
    {
        {
            vcvtq_f32_s32(vmovl_s16(vget_low_s16(gy))),
            vcvtq_f32_s32(vmovl_s16(vget_high_s16(gy)))
        }
    };

    const float32x4x2_t magnitude = mag_l2(gx2, gy2);

    // Store magnitude - Convert to uint16x8
    return vcombine_u16(vmovn_u32(vcvtq_u32_f32(magnitude.val[0])),
                        vmovn_u32(vcvtq_u32_f32(magnitude.val[1])));
}

/* Computes the magnitude using L2-norm if gradient_size = 7
 *
 * @param[in] gx Gx component
 * @param[in] gy Gy component
 *
 * @return magnitude for 8 pixels
 */
inline uint32x4x2_t mag_l2_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
{
    // Compute magnitude using L2 normalization
    float32x4x2_t gx2 =
    {
        {
            vcvtq_f32_s32(gx.val[0]),
            vcvtq_f32_s32(gx.val[1])
        }
    };

    float32x4x2_t gy2 =
    {
        {
            vcvtq_f32_s32(gy.val[0]),
            vcvtq_f32_s32(gy.val[1])
        }
    };

    const float32x4x2_t magnitude = mag_l2(gx2, gy2);
    const uint32x4x2_t  mag32 =
    {
        {
            vcvtq_u32_f32(magnitude.val[0]),
            vcvtq_u32_f32(magnitude.val[1])
        }
    };

    return mag32;
}

/* Gradient function used when the gradient size = 3 or 5 and when the norm_type = L1-norm
 *
 * @param[in]  gx_ptr        Pointer to source image. Gx image. Data type supported S16
 * @param[in]  gy_ptr        Pointer to source image. Gy image. Data type supported S16
 * @param[out] magnitude_ptr Pointer to destination image. Magnitude. Data type supported U16
 * @param[out] phase_ptr     Pointer to destination image. Quantized phase. Data type supported U8
 */
void mag_phase_l1norm_S16_S16_U16_U8(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr)
{
    const auto gx        = static_cast<const int16_t *__restrict>(gx_ptr);
    const auto gy        = static_cast<const int16_t *__restrict>(gy_ptr);
    const auto magnitude = static_cast<uint16_t *__restrict>(magnitude_ptr);
    const auto phase     = static_cast<uint8_t *__restrict>(phase_ptr);

    const int16x8x4_t gx_val =
    {
        {
            vld1q_s16(gx),
            vld1q_s16(gx + 8),
            vld1q_s16(gx + 16),
            vld1q_s16(gx + 24)
        }
    };

    const int16x8x4_t gy_val =
    {
        {
            vld1q_s16(gy),
            vld1q_s16(gy + 8),
            vld1q_s16(gy + 16),
            vld1q_s16(gy + 24)
        }
    };

    // Compute and store phase
    vst1_u8(phase + 0, phase_quantization_S16_S16(gx_val.val[0], gy_val.val[0]));
    vst1_u8(phase + 8, phase_quantization_S16_S16(gx_val.val[1], gy_val.val[1]));
    vst1_u8(phase + 16, phase_quantization_S16_S16(gx_val.val[2], gy_val.val[2]));
    vst1_u8(phase + 24, phase_quantization_S16_S16(gx_val.val[3], gy_val.val[3]));

    // Compute and store magnitude using L1 normalization
    vst1q_u16(magnitude + 0, mag_l1_S16_S16(gx_val.val[0], gy_val.val[0]));
    vst1q_u16(magnitude + 8, mag_l1_S16_S16(gx_val.val[1], gy_val.val[1]));
    vst1q_u16(magnitude + 16, mag_l1_S16_S16(gx_val.val[2], gy_val.val[2]));
    vst1q_u16(magnitude + 24, mag_l1_S16_S16(gx_val.val[3], gy_val.val[3]));
}

/* Gradient function used when the gradient size = 3 or 5 and when the norm_type = L2-norm
 *
 * @param[in]  gx_ptr        Pointer to source image. Gx image. Data type supported S16
 * @param[in]  gy_ptr        Pointer to source image. Gy image. Data type supported S16
 * @param[out] magnitude_ptr Pointer to destination image. Magnitude. Data type supported U16
 * @param[out] phase_ptr     Pointer to destination image. Quantized phase. Data type supported U8
 */
void mag_phase_l2norm_S16_S16_U16_U8(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr)
{
    const auto gx        = static_cast<const int16_t *__restrict>(gx_ptr);
    const auto gy        = static_cast<const int16_t *__restrict>(gy_ptr);
    const auto magnitude = static_cast<uint16_t *__restrict>(magnitude_ptr);
    const auto phase     = static_cast<uint8_t *__restrict>(phase_ptr);

    const int16x8x4_t gx_val =
    {
        {
            vld1q_s16(gx),
            vld1q_s16(gx + 8),
            vld1q_s16(gx + 16),
            vld1q_s16(gx + 24)
        }
    };

    const int16x8x4_t gy_val =
    {
        {
            vld1q_s16(gy),
            vld1q_s16(gy + 8),
            vld1q_s16(gy + 16),
            vld1q_s16(gy + 24)
        }
    };

    // Compute and store phase
    vst1_u8(phase + 0, phase_quantization_S16_S16(gx_val.val[0], gy_val.val[0]));
    vst1_u8(phase + 8, phase_quantization_S16_S16(gx_val.val[1], gy_val.val[1]));
    vst1_u8(phase + 16, phase_quantization_S16_S16(gx_val.val[2], gy_val.val[2]));
    vst1_u8(phase + 24, phase_quantization_S16_S16(gx_val.val[3], gy_val.val[3]));

    // Compute and store magnitude using L2 normalization
    vst1q_u16(magnitude + 0, mag_l2_S16_S16(gx_val.val[0], gy_val.val[0]));
    vst1q_u16(magnitude + 8, mag_l2_S16_S16(gx_val.val[1], gy_val.val[1]));
    vst1q_u16(magnitude + 16, mag_l2_S16_S16(gx_val.val[2], gy_val.val[2]));
    vst1q_u16(magnitude + 24, mag_l2_S16_S16(gx_val.val[3], gy_val.val[3]));
}

/* Gradient function used when the gradient size = 7 and when the norm_type = L1-norm
 *
 * @param[in]  gx_ptr        Pointer to source image. Gx image. Data type supported S32
 * @param[in]  gy_ptr        Pointer to source image. Gy image. Data type supported S32
 * @param[out] magnitude_ptr Pointer to destination image. Magnitude. Data type supported U32
 * @param[out] phase_ptr     Pointer to destination image. Quantized phase. Data type supported U8
 */
void mag_phase_l1norm_S32_S32_U32_U8(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr)
{
    auto gx        = static_cast<const int32_t *__restrict>(gx_ptr);
    auto gy        = static_cast<const int32_t *__restrict>(gy_ptr);
    auto magnitude = static_cast<uint32_t *__restrict>(magnitude_ptr);
    auto phase     = static_cast<uint8_t *__restrict>(phase_ptr);

    // Process low and high part
    for(size_t i = 0; i < 2; ++i, gx += 16, gy += 16, magnitude += 16, phase += 16)
    {
        const int32x4x2_t gx0 =
        {
            {
                vld1q_s32(gx + 0),
                vld1q_s32(gx + 4)
            }
        };

        const int32x4x2_t gx1 =
        {
            {
                vld1q_s32(gx + 8),
                vld1q_s32(gx + 12)
            }
        };

        const int32x4x2_t gy0 =
        {
            {
                vld1q_s32(gy + 0),
                vld1q_s32(gy + 4)
            }
        };

        const int32x4x2_t gy1 =
        {
            {
                vld1q_s32(gy + 8),
                vld1q_s32(gy + 12)
            }
        };

        // Compute and store phase
        vst1_u8(phase + 0, phase_quantization_S32_S32(gx0, gy0));
        vst1_u8(phase + 8, phase_quantization_S32_S32(gx1, gy1));

        // Compute magnitude using L1 normalization
        const uint32x4x2_t mag0 = mag_l1_S32_S32(gx0, gy0);
        const uint32x4x2_t mag1 = mag_l1_S32_S32(gx1, gy1);

        // Store magnitude
        vst1q_u32(magnitude + 0, mag0.val[0]);
        vst1q_u32(magnitude + 4, mag0.val[1]);
        vst1q_u32(magnitude + 8, mag1.val[0]);
        vst1q_u32(magnitude + 12, mag1.val[1]);
    }
}

/* Gradient function used when the gradient size = 7 and when the norm_type = L2-norm
 *
 * @param[in]  gx_ptr        Pointer to source image. Gx image. Data type supported S32
 * @param[in]  gy_ptr        Pointer to source image. Gy image. Data type supported S32
 * @param[out] magnitude_ptr Pointer to destination image. Magnitude. Data type supported U32
 * @param[out] phase_ptr     Pointer to destination image. Quantized phase. Data type supported U8
 */
void mag_phase_l2norm_S32_S32_U32_U8(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr)
{
    auto gx        = static_cast<const int32_t *__restrict>(gx_ptr);
    auto gy        = static_cast<const int32_t *__restrict>(gy_ptr);
    auto magnitude = static_cast<uint32_t *__restrict>(magnitude_ptr);
    auto phase     = static_cast<uint8_t *__restrict>(phase_ptr);

    // Process low and high part
    for(size_t i = 0; i < 2; ++i, gx += 16, gy += 16, magnitude += 16, phase += 16)
    {
        const int32x4x2_t gx0 =
        {
            {
                vld1q_s32(gx + 0),
                vld1q_s32(gx + 4)
            }
        };

        const int32x4x2_t gx1 =
        {
            {
                vld1q_s32(gx + 8),
                vld1q_s32(gx + 12)
            }
        };

        const int32x4x2_t gy0 =
        {
            {
                vld1q_s32(gy + 0),
                vld1q_s32(gy + 4)
            }
        };

        const int32x4x2_t gy1 =
        {
            {
                vld1q_s32(gy + 8),
                vld1q_s32(gy + 12)
            }
        };

        // Compute and store phase
        vst1_u8(phase + 0, phase_quantization_S32_S32(gx0, gy0));
        vst1_u8(phase + 8, phase_quantization_S32_S32(gx1, gy1));

        // Compute magnitude using L2 normalization
        const uint32x4x2_t mag0 = mag_l2_S32_S32(gx0, gy0);
        const uint32x4x2_t mag1 = mag_l2_S32_S32(gx1, gy1);

        // Store magnitude
        vst1q_u32(magnitude + 0, mag0.val[0]);
        vst1q_u32(magnitude + 4, mag0.val[1]);
        vst1q_u32(magnitude + 8, mag1.val[0]);
        vst1q_u32(magnitude + 12, mag1.val[1]);
    }
}

/* Computes non-maxima suppression and hysteresis when the gradient size = 3 or 5
 *
 * @param[in]  magnitude_ptr Pointer to source image. Magnitude. Data type supported U16
 * @param[in]  phase_ptr     Pointer to source image. Quantized phase. Data type supported U8
 * @param[out] output_ptr    Pointer to output image. Data type supported U8
 * @param[in]  stride_mag    Stride of magnitude image
 * @param[in]  lower_thr     Lower threshold used for the hysteresis
 * @param[in]  upper_thr     Upper threshold used for the hysteresis
 */
void non_max_suppression_U16_U8_U8(const void *__restrict magnitude_ptr, const void *__restrict phase_ptr, void *__restrict output_ptr, const uint32_t stride_mag, const int32_t lower_thr,
                                   const int32_t upper_thr)
{
    const auto magnitude = static_cast<const uint16_t *__restrict>(magnitude_ptr);
    const auto phase     = static_cast<const uint8_t *__restrict>(phase_ptr);
    const auto output    = static_cast<uint8_t *__restrict>(output_ptr);

    // Get magnitude and phase of the centre pixels
    uint16x8_t mc = vld1q_u16(magnitude);

    // Angle_quantized: 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
    const uint16x8_t pc16 = vmovl_u8(vld1_u8(phase));

    // 0 degree
    const uint16x8_t mk0_0 = vld1q_u16(magnitude - 1);
    const uint16x8_t mk0_1 = vld1q_u16(magnitude + 1);
    uint16x8_t mask0       = vceqq_u16(pc16, vdupq_n_u16(0));
    mask0                  = vandq_u16(mask0, vcgtq_u16(mc, mk0_0));
    mask0                  = vandq_u16(mask0, vcgtq_u16(mc, mk0_1));

    // 45 degree
    const uint16x8_t mk45_0 = vld1q_u16(magnitude - stride_mag - 1);
    const uint16x8_t mk45_1 = vld1q_u16(magnitude + stride_mag + 1);
    uint16x8_t mask1        = vceqq_u16(pc16, vdupq_n_u16(1));
    mask1                   = vandq_u16(mask1, vcgtq_u16(mc, mk45_0));
    mask1                   = vandq_u16(mask1, vcgtq_u16(mc, mk45_1));

    // 90 degree
    const uint16x8_t mk90_0 = vld1q_u16(magnitude - stride_mag);
    const uint16x8_t mk90_1 = vld1q_u16(magnitude + stride_mag);
    uint16x8_t mask2        = vceqq_u16(pc16, vdupq_n_u16(2));
    mask2                   = vandq_u16(mask2, vcgtq_u16(mc, mk90_0));
    mask2                   = vandq_u16(mask2, vcgtq_u16(mc, mk90_1));

    // 135 degree
    const uint16x8_t mk135_0 = vld1q_u16(magnitude - stride_mag + 1);
    const uint16x8_t mk135_1 = vld1q_u16(magnitude + stride_mag - 1);
    uint16x8_t mask3         = vceqq_u16(pc16, vdupq_n_u16(3));
    mask3                    = vandq_u16(mask3, vcgtq_u16(mc, mk135_0));
    mask3                    = vandq_u16(mask3, vcgtq_u16(mc, mk135_1));

    // Merge masks
    mask0 = vorrq_u16(mask0, mask1);
    mask2 = vorrq_u16(mask2, mask3);
    mask0 = vorrq_u16(mask0, mask2);

    mc = vbslq_u16(mask0, mc, vdupq_n_u16(0));

    // mc > upper_thr
    mask0 = vcgtq_u16(mc, vdupq_n_u16(upper_thr));

    // mc <= lower_thr
    mask1 = vcleq_u16(mc, vdupq_n_u16(lower_thr));

    // mc <= upper_thr && mc > lower_thr
    mask2 = vcleq_u16(mc, vdupq_n_u16(upper_thr));
    mask2 = vandq_u16(mask2, vcgtq_u16(mc, vdupq_n_u16(lower_thr)));

    mc = vbslq_u16(mask0, vdupq_n_u16(EDGE), mc);
    mc = vbslq_u16(mask1, vdupq_n_u16(NO_EDGE), mc);
    mc = vbslq_u16(mask2, vdupq_n_u16(MAYBE), mc);

    vst1_u8(output, vmovn_u16(mc));
}

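/* Performs non-maxima suppression and double thresholding on 4 pixels of a U32
 * magnitude image: the centre magnitude survives only if it is greater than
 * both neighbours along its quantized gradient direction, and is then
 * classified as EDGE, NO_EDGE or MAYBE against the hysteresis thresholds
 *
 * @param[in] input      Pointer to the magnitude of the first centre pixel. Data type supported U32
 * @param[in] pc         Quantized phase for 4 pixels
 * @param[in] stride_mag Stride of the magnitude image
 * @param[in] lower_thr  Lower threshold used for the hysteresis
 * @param[in] upper_thr  Upper threshold used for the hysteresis
 *
 * @return the 4 suppressed and thresholded magnitudes, narrowed to U16
 */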
inline uint16x4_t non_max_U32_helper(const uint32_t *input, const uint16x4_t pc, const uint32_t stride_mag, const int32_t lower_thr, const int32_t upper_thr)
{
    // Phase for 4 pixels
    const uint32x4_t pc32 = vmovl_u16(pc);

    // Get magnitude for 4 pixels
    uint32x4_t mc = vld1q_u32(input);

    // Angle_quantized: 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
    // 0 degree
    const uint32x4_t mk0_0 = vld1q_u32(input - 1);
    const uint32x4_t mk0_1 = vld1q_u32(input + 1);
    uint32x4_t mask0       = vceqq_u32(pc32, vdupq_n_u32(0));
    mask0                  = vandq_u32(mask0, vcgtq_u32(mc, mk0_0));
    mask0                  = vandq_u32(mask0, vcgtq_u32(mc, mk0_1));

    // 45 degree
    const uint32x4_t mk45_0 = vld1q_u32(input - stride_mag - 1);
    const uint32x4_t mk45_1 = vld1q_u32(input + stride_mag + 1);
    uint32x4_t mask1        = vceqq_u32(pc32, vdupq_n_u32(1));
    mask1                   = vandq_u32(mask1, vcgtq_u32(mc, mk45_0));
    mask1                   = vandq_u32(mask1, vcgtq_u32(mc, mk45_1));

    // 90 degree
    const uint32x4_t mk90_0 = vld1q_u32(input - stride_mag);
    const uint32x4_t mk90_1 = vld1q_u32(input + stride_mag);
    uint32x4_t mask2        = vceqq_u32(pc32, vdupq_n_u32(2));
    mask2                   = vandq_u32(mask2, vcgtq_u32(mc, mk90_0));
    mask2                   = vandq_u32(mask2, vcgtq_u32(mc, mk90_1));

    // 135 degree
    const uint32x4_t mk135_0 = vld1q_u32(input - stride_mag + 1);
    const uint32x4_t mk135_1 = vld1q_u32(input + stride_mag - 1);
    uint32x4_t mask3         = vceqq_u32(pc32, vdupq_n_u32(3));
    mask3                    = vandq_u32(mask3, vcgtq_u32(mc, mk135_0));
    mask3                    = vandq_u32(mask3, vcgtq_u32(mc, mk135_1));

    // Merge masks
    mask0 = vorrq_u32(mask0, mask1);
    mask2 = vorrq_u32(mask2, mask3);
    mask0 = vorrq_u32(mask0, mask2);

    mc = vbslq_u32(mask0, mc, vdupq_n_u32(0));

    // mc > upper_thr
    mask0 = vcgtq_u32(mc, vdupq_n_u32(upper_thr));

    // mc <= lower_thr
    mask1 = vcleq_u32(mc, vdupq_n_u32(lower_thr));

    // mc <= upper_thr && mc > lower_thr
    mask2 = vcleq_u32(mc, vdupq_n_u32(upper_thr));
    mask2 = vandq_u32(mask2, vcgtq_u32(mc, vdupq_n_u32(lower_thr)));

    mc = vbslq_u32(mask0, vdupq_n_u32(EDGE), mc);
    mc = vbslq_u32(mask1, vdupq_n_u32(NO_EDGE), mc);
    mc = vbslq_u32(mask2, vdupq_n_u32(MAYBE), mc);

    return vmovn_u32(mc);
}

/* Computes non-maxima suppression and hysteresis when the gradient_size = 7
 *
 * @param[in]  magnitude_ptr Pointer to source image. Magnitude. Data type supported U32
 * @param[in]  phase_ptr     Pointer to source image. Quantized phase. Data type supported U8
 * @param[out] output_ptr    Pointer to destination image. Data type supported U8
 * @param[in]  stride_mag    Stride of magnitude image
 * @param[in]  lower_thr     Lower threshold used for the hysteresis
 * @param[in]  upper_thr     Upper threshold used for the hysteresis
 */
void non_max_suppression_U32_U8_U8(const void *__restrict magnitude_ptr, const void *__restrict phase_ptr, void *__restrict output_ptr, const uint32_t stride_mag, const int32_t lower_thr,
                                   const int32_t upper_thr)
{
    const auto magnitude = static_cast<const uint32_t *__restrict>(magnitude_ptr);
    const auto phase     = static_cast<const uint8_t *__restrict>(phase_ptr);
    const auto output    = static_cast<uint8_t *__restrict>(output_ptr);

    // Get phase for 8 pixels
    const uint16x8_t pc16 = vmovl_u8(vld1_u8(phase));

    // Compute non-maxima suppression
    const uint16x4x2_t res =
    {
        {
            non_max_U32_helper(magnitude, vget_low_u16(pc16), stride_mag, lower_thr, upper_thr),
            non_max_U32_helper(magnitude + 4, vget_high_u16(pc16), stride_mag, lower_thr, upper_thr)
        }
    };

    // Store result
    vst1_u8(output, vmovn_u16(vcombine_u16(res.val[0], res.val[1])));
}

/* Recursive helper for edge tracing: marks the current pixel as EDGE and
 * promotes to EDGE every MAYBE pixel that is 8-connected to it. Called by
 * edge_trace_U8_U8
 *
 * @param[in]  input         Pointer to source image. Data type supported U8
 * @param[out] output        Pointer to destination image. Data type supported U8
 * @param[in]  input_stride  Stride of the input image
 * @param[in]  output_stride Stride of the output image
 */
void edge_trace_recursive_U8_U8(uint8_t *__restrict input, uint8_t *__restrict output, const int32_t input_stride, const int32_t output_stride)
{
    // Look for MAYBE pixels in 8 directions
    *output = EDGE;

    // (-1, 0)
    uint8_t pixel = *(input - 1);

    if(pixel == MAYBE)
    {
        // Touched a MAYBE point. MAYBE becomes EDGE
        *(input - 1) = EDGE;

        edge_trace_recursive_U8_U8(input - 1, output - 1, input_stride, output_stride);
    }

    // (+1, 0)
    pixel = *(input + 1);

    if(pixel == MAYBE)
    {
        // Touched a MAYBE point. MAYBE becomes EDGE
        *(input + 1) = EDGE;

        edge_trace_recursive_U8_U8(input + 1, output + 1, input_stride, output_stride);
    }

    input -= input_stride;
    output -= output_stride;

    // (-1, -1)
    pixel = *(input - 1);

    if(pixel == MAYBE)
    {
        // Touched a MAYBE point. MAYBE becomes EDGE
        *(input - 1) = EDGE;

        edge_trace_recursive_U8_U8(input - 1, output - 1, input_stride, output_stride);
    }

    // (0, -1)
    pixel = *input;

    if(pixel == MAYBE)
    {
        // Touched a MAYBE point. MAYBE becomes EDGE
        *input = EDGE;

        edge_trace_recursive_U8_U8(input, output, input_stride, output_stride);
    }

    // (+1, -1)
    pixel = *(input + 1);

    if(pixel == MAYBE)
    {
        // Touched a MAYBE point. MAYBE becomes EDGE
        *(input + 1) = EDGE;

        edge_trace_recursive_U8_U8(input + 1, output + 1, input_stride, output_stride);
    }

    input += input_stride * 2;
    output += output_stride * 2;

    // (-1, +1)
    pixel = *(input - 1);

    if(pixel == MAYBE)
    {
        // Touched a MAYBE point. MAYBE becomes EDGE
        *(input - 1) = EDGE;

        edge_trace_recursive_U8_U8(input - 1, output - 1, input_stride, output_stride);
    }

    // (0, +1)
    pixel = *input;

    if(pixel == MAYBE)
    {
        // Touched a MAYBE point. MAYBE becomes EDGE
        *input = EDGE;

        edge_trace_recursive_U8_U8(input, output, input_stride, output_stride);
    }

    // (+1, +1)
    pixel = *(input + 1);

    if(pixel == MAYBE)
    {
        // Touched a MAYBE point. MAYBE becomes EDGE
        *(input + 1) = EDGE;

        edge_trace_recursive_U8_U8(input + 1, output + 1, input_stride, output_stride);
    }
}

/* Computes edge tracing: starts a depth-first trace from every EDGE pixel that
 * has not been traced yet, turning connected MAYBE pixels into edges
 *
 * @param[in]  input         Pointer to source image. Data type supported U8
 * @param[out] output        Pointer to destination image. Data type supported U8
 * @param[in]  input_stride  Stride of the input image
 * @param[in]  output_stride Stride of the output image
 */
void edge_trace_U8_U8(uint8_t *__restrict input, uint8_t *__restrict output, const int32_t input_stride, const int32_t output_stride)
{
    if(*input == NO_EDGE)
    {
        *output = NO_EDGE;
    }
    // Check if EDGE and not yet touched
    else if((*input == EDGE) && (*output == NO_EDGE))
    {
        edge_trace_recursive_U8_U8(input, output, input_stride, output_stride);
    }
}
} // namespace

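// The kernels below implement the Canny stages that follow the Sobel gradient:
// gradient magnitude/phase, non-maxima suppression with double thresholding,
// and edge tracing. A minimal usage sketch, assuming already configured and
// allocated tensors (the production wiring lives in the NECannyEdge runtime
// function; the tensor names here are placeholders):
//
//     NEGradientKernel              gradient;
//     NEEdgeNonMaxSuppressionKernel suppression;
//     NEEdgeTraceKernel             trace;
//
//     gradient.configure(&gx, &gy, &magnitude, &phase, norm_type);
//     suppression.configure(&magnitude, &phase, &nonmax, upper_thr, lower_thr, false);
//     trace.configure(&nonmax, &output);
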
NEGradientKernel::NEGradientKernel()
    : _func(nullptr), _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr)
{
}

void NEGradientKernel::configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(gx, gy, magnitude, phase);

    set_shape_if_empty(*magnitude->info(), gx->info()->tensor_shape());
    set_shape_if_empty(*phase->info(), gx->info()->tensor_shape());

    Format magnitude_format = gx->info()->data_type() == DataType::S16 ? Format::U16 : Format::U32;
    set_format_if_unknown(*magnitude->info(), magnitude_format);
    set_format_if_unknown(*phase->info(), Format::U8);

    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(gx, gy, magnitude, phase);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, gy);
    ARM_COMPUTE_ERROR_ON_MSG(element_size_from_data_type(gx->info()->data_type()) != element_size_from_data_type(magnitude->info()->data_type()),
                             "Magnitude must have the same element size as Gx and Gy");

    _gx        = gx;
    _gy        = gy;
    _magnitude = magnitude;
    _phase     = phase;

    if(_gx->info()->data_type() == DataType::S16)
    {
        if(norm_type == 1)
        {
            _func = &mag_phase_l1norm_S16_S16_U16_U8;
        }
        else
        {
            _func = &mag_phase_l2norm_S16_S16_U16_U8;
        }
    }
    else
    {
        if(norm_type == 1)
        {
            _func = &mag_phase_l1norm_S32_S32_U32_U8;
        }
        else
        {
            _func = &mag_phase_l2norm_S32_S32_U32_U8;
        }
    }

    constexpr unsigned int num_elems_processed_per_iteration = 32;

    // Configure kernel window
    Window win = calculate_max_window(*_gx->info(), Steps(num_elems_processed_per_iteration));

    AccessWindowHorizontal gx_access(_gx->info(), 0, num_elems_processed_per_iteration);
    AccessWindowHorizontal gy_access(_gy->info(), 0, num_elems_processed_per_iteration);
    AccessWindowHorizontal mag_access(_magnitude->info(), 0, num_elems_processed_per_iteration);
    AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration);

    ARM_COMPUTE_UNUSED(update_window_and_padding(win, gx_access, gy_access, mag_access, phase_access));

    mag_access.set_valid_region(win, _gx->info()->valid_region());
    phase_access.set_valid_region(win, _gx->info()->valid_region());

    INEKernel::configure(win);
}

void NEGradientKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
    ARM_COMPUTE_ERROR_ON(_func == nullptr);
    Iterator gx(_gx, window);
    Iterator gy(_gy, window);
    Iterator magnitude(_magnitude, window);
    Iterator phase(_phase, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        (*_func)(gx.ptr(), gy.ptr(), magnitude.ptr(), phase.ptr());
    },
    gx, gy, magnitude, phase);
}

NEEdgeNonMaxSuppressionKernel::NEEdgeNonMaxSuppressionKernel()
    : _func(nullptr), _magnitude(nullptr), _phase(nullptr), _output(nullptr), _lower_thr(0), _upper_thr(0)
{
}

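// A 1-pixel border is required because non-maxima suppression compares each
// pixel against its 3x3 neighbourhood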
BorderSize NEEdgeNonMaxSuppressionKernel::border_size() const
{
    return BorderSize(1);
}

void NEEdgeNonMaxSuppressionKernel::configure(const ITensor *magnitude, const ITensor *phase, ITensor *output,
                                              int32_t upper_thr, int32_t lower_thr, bool border_undefined)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(magnitude, phase, output);

    set_shape_if_empty(*output->info(), magnitude->info()->tensor_shape());

    set_format_if_unknown(*phase->info(), Format::U8);
    set_format_if_unknown(*output->info(), Format::U8);

    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(magnitude, phase, output);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(phase, output);

    _magnitude = magnitude;
    _phase     = phase;
    _output    = output;

    switch(_magnitude->info()->data_type())
    {
        case DataType::U16:
            _func = &non_max_suppression_U16_U8_U8;
            break;
        case DataType::U32:
            _func = &non_max_suppression_U32_U8_U8;
            break;
        default:
            ARM_COMPUTE_ERROR("Unsupported data type!");
    }

    // Set thresholds
    _lower_thr = lower_thr;
    _upper_thr = upper_thr;

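    // Each iteration processes 8 pixels but reads a 10x3 region of the magnitude
    // image: one extra column on each side plus the rows above and below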
    constexpr unsigned int num_elems_processed_per_iteration = 8;
    constexpr unsigned int num_elems_read_per_iteration      = 10;
    constexpr unsigned int num_rows_read_per_iteration       = 3;

    // Configure kernel window
    Window win = calculate_max_window(*_magnitude->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());

    AccessWindowRectangle  mag_access(_magnitude->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
    AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration);
    AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration);

    update_window_and_padding(win, mag_access, phase_access, output_access);

    output_access.set_valid_region(win, _magnitude->info()->valid_region(), border_undefined, border_size());

    INEKernel::configure(win);
}

void NEEdgeNonMaxSuppressionKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
    ARM_COMPUTE_ERROR_ON(_func == nullptr);
    Iterator magnitude(_magnitude, window);
    Iterator phase(_phase, window);
    Iterator output(_output, window);

    const size_t input1_stride        = _magnitude->info()->strides_in_bytes()[1];
    const size_t input1_stride_ushort = input1_stride / data_size_from_type(_magnitude->info()->data_type());

    execute_window_loop(window, [&](const Coordinates &)
    {
        (*_func)(magnitude.ptr(), phase.ptr(), output.ptr(), input1_stride_ushort, _lower_thr, _upper_thr);
    },
    magnitude, phase, output);
}

NEEdgeTraceKernel::NEEdgeTraceKernel()
    : _input(nullptr), _output(nullptr)
{
}

BorderSize NEEdgeTraceKernel::border_size() const
{
    return BorderSize(1);
}

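// Edge tracing follows MAYBE pixels across the whole image, so the work cannot
// be split into independent windows and the kernel must run on a single thread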
bool NEEdgeTraceKernel::is_parallelisable() const
{
    return false;
}

void NEEdgeTraceKernel::configure(ITensor *input, ITensor *output)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);

    set_shape_if_empty(*output->info(), input->info()->tensor_shape());

    set_format_if_unknown(*input->info(), Format::U8);
    set_format_if_unknown(*output->info(), Format::U8);

    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);

    _input  = input;
    _output = output;

    constexpr unsigned int num_elems_processed_per_iteration = 1;

    // Configure kernel window
    Window win = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration));

    const ValidRegion &input_valid_region  = input->info()->valid_region();
    const ValidRegion &output_valid_region = output->info()->valid_region();

    // Reads can occur within the valid region of the input + border
    AccessWindowStatic input_access(input->info(),
                                    input_valid_region.anchor[0] - border_size().left,
                                    input_valid_region.anchor[1] - border_size().top,
                                    input_valid_region.anchor[0] + input_valid_region.shape[0] + border_size().right,
                                    input_valid_region.anchor[1] + input_valid_region.shape[1] + border_size().bottom);

    // Writes can occur within the valid region of the output + border
    AccessWindowStatic output_access(output->info(),
                                     output_valid_region.anchor[0] - border_size().left,
                                     output_valid_region.anchor[1] - border_size().top,
                                     output_valid_region.anchor[0] + output_valid_region.shape[0] + border_size().right,
                                     output_valid_region.anchor[1] + output_valid_region.shape[1] + border_size().bottom);

    update_window_and_padding(win, input_access, output_access);

    output_access.set_valid_region(win, _input->info()->valid_region());

    INEKernel::configure(win);
}

void NEEdgeTraceKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
    Iterator input(_input, window);
    Iterator output(_output, window);

    const size_t input_stride  = _input->info()->strides_in_bytes()[1];
    const size_t output_stride = _output->info()->strides_in_bytes()[1];

    execute_window_loop(window, [&](const Coordinates &)
    {
        edge_trace_U8_U8(input.ptr(), output.ptr(), input_stride, output_stride);
    },
    input, output);
}