/*
 * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/core/NEON/kernels/NEHOGDescriptorKernel.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/HOGInfo.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"

#include <algorithm>
#include <arm_neon.h>
#include <cstring>

using namespace arm_compute;

namespace
{
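// Compute the orientation-binning histogram of a single cell narrower than
// 8 pixels: each pixel casts a magnitude-weighted vote that is split linearly
// between the two orientation bins closest to its scaled phase.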
void cell_width_lt8(const int16_t *__restrict mag_row_ptr, const uint8_t *__restrict phase_row_ptr, float *__restrict output_ptr,
                    size_t mag_stride, size_t phase_stride, size_t cell_width, size_t cell_height, size_t num_bins, float phase_scale)
{
    const float32x4_t        scale_f32     = vdupq_n_f32(phase_scale);
    static const float32x4_t one_f32       = vdupq_n_f32(1.0f);
    static const float32x4_t zerofive_f32  = vdupq_n_f32(0.5f);
    static const int32x4_t   zero_s32      = vdupq_n_s32(0);
    static const int32x4_t   one_s32       = vdupq_n_s32(1);
    const int32x4_t          num_bins_s32  = vdupq_n_s32(num_bins);

    memset(output_ptr, 0, sizeof(float) * num_bins);

    for(size_t yc = 0; yc < cell_height; ++yc)
    {
        int32_t xc = 0;

        for(; xc <= static_cast<int32_t>(cell_width) - 4; xc += 4)
        {
            // Load magnitude and phase values
            const uint8x8_t phase_u8 = vld1_u8(phase_row_ptr + xc + yc * phase_stride);
            const int16x4_t mag_s16  = vld1_s16(mag_row_ptr + xc + yc * mag_stride);

            // Convert magnitude and phase to float
            const float32x4_t mag_f32   = vcvtq_f32_s32(vmovl_s16(mag_s16));
            float32x4_t       phase_f32 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(phase_u8))));

            // Scale phase: phase * scale + 0.5f
            phase_f32 = vmlaq_f32(zerofive_f32, phase_f32, scale_f32);
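
            // With the +0.5f offset, truncating the scaled phase (vcvtq_s32_f32
            // rounds towards zero, which equals floor() for these non-negative
            // values) yields the index of the first bin, and the fractional
            // remainder becomes the weight w1 of the neighbouring bin.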

            // Compute histogram index
            int32x4_t hidx_s32 = vcvtq_s32_f32(phase_f32);

            // Compute magnitude weights (w0 and w1)
            const float32x4_t hidx_f32 = vcvtq_f32_s32(hidx_s32);

            // w1 = phase_f32 - hidx_f32
            const float32x4_t w1_f32 = vsubq_f32(phase_f32, hidx_f32);

            // w0 = 1.0 - w1
            const float32x4_t w0_f32 = vsubq_f32(one_f32, w1_f32);

            // Compute the contributions of the vote split between the two bins
            const float32x4_t mag_w0_f32 = vmulq_f32(mag_f32, w0_f32);
            const float32x4_t mag_w1_f32 = vmulq_f32(mag_f32, w1_f32);

            // Weighted vote between 2 bins

            // Check if the histogram index is equal to num_bins. If so, replace the index with 0
            uint32x4_t mask = vceqq_s32(hidx_s32, num_bins_s32);
            hidx_s32        = vbslq_s32(mask, zero_s32, hidx_s32);

            // Bin 0
            *(output_ptr + vgetq_lane_s32(hidx_s32, 0)) += vgetq_lane_f32(mag_w0_f32, 0);
            *(output_ptr + vgetq_lane_s32(hidx_s32, 1)) += vgetq_lane_f32(mag_w0_f32, 1);
            *(output_ptr + vgetq_lane_s32(hidx_s32, 2)) += vgetq_lane_f32(mag_w0_f32, 2);
            *(output_ptr + vgetq_lane_s32(hidx_s32, 3)) += vgetq_lane_f32(mag_w0_f32, 3);

            hidx_s32 = vaddq_s32(hidx_s32, one_s32);

            // Check if the histogram index is equal to num_bins
            mask     = vceqq_s32(hidx_s32, num_bins_s32);
            hidx_s32 = vbslq_s32(mask, zero_s32, hidx_s32);

            // Bin 1
            *(output_ptr + vgetq_lane_s32(hidx_s32, 0)) += vgetq_lane_f32(mag_w1_f32, 0);
            *(output_ptr + vgetq_lane_s32(hidx_s32, 1)) += vgetq_lane_f32(mag_w1_f32, 1);
            *(output_ptr + vgetq_lane_s32(hidx_s32, 2)) += vgetq_lane_f32(mag_w1_f32, 2);
            *(output_ptr + vgetq_lane_s32(hidx_s32, 3)) += vgetq_lane_f32(mag_w1_f32, 3);
        }

        for(; xc < static_cast<int32_t>(cell_width); ++xc)
        {
            const float phase_value = *(phase_row_ptr + xc + yc * phase_stride) * phase_scale + 0.5f;
            const float mag_value   = *(mag_row_ptr + xc + yc * mag_stride);

            const float w1 = phase_value - std::floor(phase_value);

            // The quantised phase is the histogram index in [0, num_bins - 1]
            // Wrap the index: if hidx == num_bins, hidx becomes 0
            const auto hidx = static_cast<size_t>(phase_value) % num_bins;

            // Weighted vote between 2 bins
            *(output_ptr + hidx) += mag_value * (1.0f - w1);
            *(output_ptr + ((hidx + 1) % (num_bins))) += mag_value * w1;
        }
    }
}

void cell_width_ge8(const int16_t *__restrict mag_row_ptr, const uint8_t *__restrict phase_row_ptr, float *__restrict output_ptr, size_t mag_stride, size_t phase_stride, size_t cell_width,
                    size_t cell_height, size_t num_bins, float phase_scale)
{
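    // Same algorithm as cell_width_lt8, but processes eight pixels per
    // iteration by widening each magnitude/phase load into a pair of
    // four-lane float vectors.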
    const float32x4_t        scale_f32     = vdupq_n_f32(phase_scale);
    static const float32x4_t one_f32       = vdupq_n_f32(1.0f);
    static const float32x4_t zerofive_f32  = vdupq_n_f32(0.5f);
    static const int32x4_t   zero_s32      = vdupq_n_s32(0);
    static const int32x4_t   one_s32       = vdupq_n_s32(1);
    const int32x4_t          num_bins_s32  = vdupq_n_s32(num_bins);

    memset(output_ptr, 0, sizeof(float) * num_bins);

    for(size_t yc = 0; yc < cell_height; ++yc)
    {
        int32_t xc = 0;

        for(; xc <= static_cast<int32_t>(cell_width) - 8; xc += 8)
        {
            // Load magnitude and phase values
            const uint8x8_t phase_u8 = vld1_u8(phase_row_ptr + xc + yc * phase_stride);
            const int16x8_t mag_s16  = vld1q_s16(mag_row_ptr + xc + yc * mag_stride);

            // Convert phase to U16
            const uint16x8_t phase_u16 = vmovl_u8(phase_u8);

            // Convert magnitude to float32
            const float32x4x2_t mag_f32 =
            {
                {
                    vcvtq_f32_s32(vmovl_s16(vget_low_s16(mag_s16))),
                    vcvtq_f32_s32(vmovl_s16(vget_high_s16(mag_s16)))
                }
            };

            // Convert phase to float32
            float32x4x2_t phase_f32 =
            {
                {
                    vcvtq_f32_u32(vmovl_u16(vget_low_u16(phase_u16))),
                    vcvtq_f32_u32(vmovl_u16(vget_high_u16(phase_u16)))
                }
            };

            // Scale phase: phase * scale + 0.5f
            phase_f32.val[0] = vmlaq_f32(zerofive_f32, phase_f32.val[0], scale_f32);
            phase_f32.val[1] = vmlaq_f32(zerofive_f32, phase_f32.val[1], scale_f32);

            // Compute histogram index
            int32x4x2_t hidx_s32 =
            {
                {
                    vcvtq_s32_f32(phase_f32.val[0]),
                    vcvtq_s32_f32(phase_f32.val[1])
                }
            };

            // Compute magnitude weights (w0 and w1)
            const float32x4x2_t hidx_f32 =
            {
                {
                    vcvtq_f32_s32(hidx_s32.val[0]),
                    vcvtq_f32_s32(hidx_s32.val[1])
                }
            };

            float32x4x2_t w1_f32 =
            {
                {
                    vsubq_f32(phase_f32.val[0], hidx_f32.val[0]),
                    vsubq_f32(phase_f32.val[1], hidx_f32.val[1])
                }
            };

            float32x4x2_t w0_f32 =
            {
                {
                    vsubq_f32(one_f32, w1_f32.val[0]),
                    vsubq_f32(one_f32, w1_f32.val[1])
                }
            };

            // Compute the contributions of the vote split between the two bins
            const float32x4x2_t mag_w0_f32 =
            {
                {
                    vmulq_f32(mag_f32.val[0], w0_f32.val[0]),
                    vmulq_f32(mag_f32.val[1], w0_f32.val[1])
                }
            };

            const float32x4x2_t mag_w1_f32 =
            {
                {
                    vmulq_f32(mag_f32.val[0], w1_f32.val[0]),
                    vmulq_f32(mag_f32.val[1], w1_f32.val[1])
                }
            };

            // Weighted vote between 2 bins

            // Check if the histogram index is equal to num_bins
            uint32x4x2_t mask =
            {
                {
                    vceqq_s32(hidx_s32.val[0], num_bins_s32),
                    vceqq_s32(hidx_s32.val[1], num_bins_s32)
                }
            };

            hidx_s32.val[0] = vbslq_s32(mask.val[0], zero_s32, hidx_s32.val[0]);
            hidx_s32.val[1] = vbslq_s32(mask.val[1], zero_s32, hidx_s32.val[1]);

            // First bin - low
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 0)) += vgetq_lane_f32(mag_w0_f32.val[0], 0);
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 1)) += vgetq_lane_f32(mag_w0_f32.val[0], 1);
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 2)) += vgetq_lane_f32(mag_w0_f32.val[0], 2);
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 3)) += vgetq_lane_f32(mag_w0_f32.val[0], 3);

            // First bin - high
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 0)) += vgetq_lane_f32(mag_w0_f32.val[1], 0);
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 1)) += vgetq_lane_f32(mag_w0_f32.val[1], 1);
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 2)) += vgetq_lane_f32(mag_w0_f32.val[1], 2);
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 3)) += vgetq_lane_f32(mag_w0_f32.val[1], 3);

            hidx_s32.val[0] = vaddq_s32(hidx_s32.val[0], one_s32);
            hidx_s32.val[1] = vaddq_s32(hidx_s32.val[1], one_s32);

            // Check if the histogram index is equal to num_bins
            mask.val[0] = vceqq_s32(hidx_s32.val[0], num_bins_s32);
            mask.val[1] = vceqq_s32(hidx_s32.val[1], num_bins_s32);

            hidx_s32.val[0] = vbslq_s32(mask.val[0], zero_s32, hidx_s32.val[0]);
            hidx_s32.val[1] = vbslq_s32(mask.val[1], zero_s32, hidx_s32.val[1]);

            // Second bin - low
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 0)) += vgetq_lane_f32(mag_w1_f32.val[0], 0);
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 1)) += vgetq_lane_f32(mag_w1_f32.val[0], 1);
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 2)) += vgetq_lane_f32(mag_w1_f32.val[0], 2);
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 3)) += vgetq_lane_f32(mag_w1_f32.val[0], 3);

            // Second bin - high
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 0)) += vgetq_lane_f32(mag_w1_f32.val[1], 0);
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 1)) += vgetq_lane_f32(mag_w1_f32.val[1], 1);
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 2)) += vgetq_lane_f32(mag_w1_f32.val[1], 2);
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 3)) += vgetq_lane_f32(mag_w1_f32.val[1], 3);
        }

        for(; xc < static_cast<int32_t>(cell_width); ++xc)
        {
            const float phase_value = *(phase_row_ptr + xc + yc * phase_stride) * phase_scale + 0.5f;
            const float mag_value   = *(mag_row_ptr + xc + yc * mag_stride);

            const float w1 = phase_value - std::floor(phase_value);

            // The quantised phase is the histogram index in [0, num_bins - 1]
            // Wrap the index: if hidx == num_bins, hidx becomes 0
            const size_t hidx = static_cast<size_t>(phase_value) % num_bins;

            // Weighted vote between 2 bins
            *(output_ptr + hidx) += mag_value * (1.0f - w1);
            *(output_ptr + ((hidx + 1) % (num_bins))) += mag_value * w1;
        }
    }
}

void l2_norm(const float *__restrict input_row_ptr, float *__restrict output_ptr, size_t input_stride,
             size_t num_cells_per_block_height, size_t num_bins_block_x, size_t num_bins_block, float l2_hyst_threshold)
{
    ARM_COMPUTE_UNUSED(l2_hyst_threshold);

    float       sum     = 0.0f;
    float32x4_t sum_f32 = vdupq_n_f32(0.0f);
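
    // Two passes: the first copies the block histogram to the output while
    // accumulating the sum of squares; the second rescales the output by
    // 1 / (sqrt(sum) + num_bins_block * 0.1f), where the additive term guards
    // against division by zero on near-empty blocks.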

    // Compute L2-Norm
    for(size_t yc = 0; yc < num_cells_per_block_height; ++yc)
    {
        const float *const hist_ptr = input_row_ptr + yc * input_stride;

        int32_t xc = 0;

        for(; xc <= static_cast<int32_t>(num_bins_block_x) - 16; xc += 16)
        {
            const float32x4x4_t input_value =
            {
                {
                    vld1q_f32(hist_ptr + xc + 0),
                    vld1q_f32(hist_ptr + xc + 4),
                    vld1q_f32(hist_ptr + xc + 8),
                    vld1q_f32(hist_ptr + xc + 12)
                }
            };

            // Compute input_value^2
            sum_f32 = vmlaq_f32(sum_f32, input_value.val[0], input_value.val[0]);
            sum_f32 = vmlaq_f32(sum_f32, input_value.val[1], input_value.val[1]);
            sum_f32 = vmlaq_f32(sum_f32, input_value.val[2], input_value.val[2]);
            sum_f32 = vmlaq_f32(sum_f32, input_value.val[3], input_value.val[3]);

            vst1q_f32(&output_ptr[xc + 0 + yc * num_bins_block_x], input_value.val[0]);
            vst1q_f32(&output_ptr[xc + 4 + yc * num_bins_block_x], input_value.val[1]);
            vst1q_f32(&output_ptr[xc + 8 + yc * num_bins_block_x], input_value.val[2]);
            vst1q_f32(&output_ptr[xc + 12 + yc * num_bins_block_x], input_value.val[3]);
        }

        // Compute left over
        for(; xc < static_cast<int32_t>(num_bins_block_x); ++xc)
        {
            const float input_value = hist_ptr[xc];

            sum += input_value * input_value;

            output_ptr[xc + yc * num_bins_block_x] = input_value;
        }
    }

    sum += vgetq_lane_f32(sum_f32, 0);
    sum += vgetq_lane_f32(sum_f32, 1);
    sum += vgetq_lane_f32(sum_f32, 2);
    sum += vgetq_lane_f32(sum_f32, 3);

    const float       scale     = 1.0f / (std::sqrt(sum) + num_bins_block * 0.1f);
    const float32x4_t scale_f32 = vdupq_n_f32(scale);

    int32_t i = 0;

    for(; i <= static_cast<int32_t>(num_bins_block) - 16; i += 16)
    {
        float32x4x4_t input_value =
        {
            {
                vld1q_f32(&output_ptr[i + 0]),
                vld1q_f32(&output_ptr[i + 4]),
                vld1q_f32(&output_ptr[i + 8]),
                vld1q_f32(&output_ptr[i + 12])
            }
        };

        // Scale input_value
        input_value.val[0] = vmulq_f32(input_value.val[0], scale_f32);
        input_value.val[1] = vmulq_f32(input_value.val[1], scale_f32);
        input_value.val[2] = vmulq_f32(input_value.val[2], scale_f32);
        input_value.val[3] = vmulq_f32(input_value.val[3], scale_f32);

        vst1q_f32(&output_ptr[i + 0], input_value.val[0]);
        vst1q_f32(&output_ptr[i + 4], input_value.val[1]);
        vst1q_f32(&output_ptr[i + 8], input_value.val[2]);
        vst1q_f32(&output_ptr[i + 12], input_value.val[3]);
    }

    for(; i < static_cast<int32_t>(num_bins_block); ++i)
    {
        output_ptr[i] *= scale;
    }
}

void l2hys_norm(const float *__restrict input_row_ptr, float *__restrict output_ptr, size_t input_stride, size_t num_cells_per_block_height, size_t num_bins_block_x, size_t num_bins_block,
                float l2_hyst_threshold)
{
    float       sum     = 0.0f;
    float32x4_t sum_f32 = vdupq_n_f32(0.0f);
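
    // L2-Hys normalisation (Dalal and Triggs): L2-normalise the block
    // histogram, clip each component at l2_hyst_threshold, then renormalise.
    // The loops below fuse the clipping with the first rescale.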

    // Compute L2-Hys
    for(size_t yc = 0; yc < num_cells_per_block_height; ++yc)
    {
        const float *const hist_ptr = input_row_ptr + yc * input_stride;

        int32_t xc = 0;

        for(; xc <= static_cast<int32_t>(num_bins_block_x) - 16; xc += 16)
        {
            const float32x4x4_t input_value =
            {
                {
                    vld1q_f32(hist_ptr + xc + 0),
                    vld1q_f32(hist_ptr + xc + 4),
                    vld1q_f32(hist_ptr + xc + 8),
                    vld1q_f32(hist_ptr + xc + 12)
                }
            };

            // Compute input_value^2
            sum_f32 = vmlaq_f32(sum_f32, input_value.val[0], input_value.val[0]);
            sum_f32 = vmlaq_f32(sum_f32, input_value.val[1], input_value.val[1]);
            sum_f32 = vmlaq_f32(sum_f32, input_value.val[2], input_value.val[2]);
            sum_f32 = vmlaq_f32(sum_f32, input_value.val[3], input_value.val[3]);

            vst1q_f32(&output_ptr[xc + 0 + yc * num_bins_block_x], input_value.val[0]);
            vst1q_f32(&output_ptr[xc + 4 + yc * num_bins_block_x], input_value.val[1]);
            vst1q_f32(&output_ptr[xc + 8 + yc * num_bins_block_x], input_value.val[2]);
            vst1q_f32(&output_ptr[xc + 12 + yc * num_bins_block_x], input_value.val[3]);
        }

        // Compute left over
        for(; xc < static_cast<int32_t>(num_bins_block_x); ++xc)
        {
            const float input_value = hist_ptr[xc];

            sum += input_value * input_value;

            output_ptr[xc + yc * num_bins_block_x] = input_value;
        }
    }

    sum += vgetq_lane_f32(sum_f32, 0);
    sum += vgetq_lane_f32(sum_f32, 1);
    sum += vgetq_lane_f32(sum_f32, 2);
    sum += vgetq_lane_f32(sum_f32, 3);

    float             scale                   = 1.0f / (std::sqrt(sum) + num_bins_block * 0.1f);
    float32x4_t       scale_f32               = vdupq_n_f32(scale);
    const float32x4_t l2_hyst_threshold_f32   = vdupq_n_f32(l2_hyst_threshold);

    // Reset sum
    sum_f32 = vdupq_n_f32(0.0f);
    sum     = 0.0f;

    int32_t i = 0;

    for(; i <= static_cast<int32_t>(num_bins_block) - 16; i += 16)
    {
        float32x4x4_t input_value =
        {
            {
                vld1q_f32(&output_ptr[i + 0]),
                vld1q_f32(&output_ptr[i + 4]),
                vld1q_f32(&output_ptr[i + 8]),
                vld1q_f32(&output_ptr[i + 12])
            }
        };

        // Scale input_value
        input_value.val[0] = vmulq_f32(input_value.val[0], scale_f32);
        input_value.val[1] = vmulq_f32(input_value.val[1], scale_f32);
        input_value.val[2] = vmulq_f32(input_value.val[2], scale_f32);
        input_value.val[3] = vmulq_f32(input_value.val[3], scale_f32);

        // Clip input_value if over l2_hyst_threshold
        input_value.val[0] = vminq_f32(input_value.val[0], l2_hyst_threshold_f32);
        input_value.val[1] = vminq_f32(input_value.val[1], l2_hyst_threshold_f32);
        input_value.val[2] = vminq_f32(input_value.val[2], l2_hyst_threshold_f32);
        input_value.val[3] = vminq_f32(input_value.val[3], l2_hyst_threshold_f32);

        // Compute input_value^2
        sum_f32 = vmlaq_f32(sum_f32, input_value.val[0], input_value.val[0]);
        sum_f32 = vmlaq_f32(sum_f32, input_value.val[1], input_value.val[1]);
        sum_f32 = vmlaq_f32(sum_f32, input_value.val[2], input_value.val[2]);
        sum_f32 = vmlaq_f32(sum_f32, input_value.val[3], input_value.val[3]);

        vst1q_f32(&output_ptr[i + 0], input_value.val[0]);
        vst1q_f32(&output_ptr[i + 4], input_value.val[1]);
        vst1q_f32(&output_ptr[i + 8], input_value.val[2]);
        vst1q_f32(&output_ptr[i + 12], input_value.val[3]);
    }

    sum += vgetq_lane_f32(sum_f32, 0);
    sum += vgetq_lane_f32(sum_f32, 1);
    sum += vgetq_lane_f32(sum_f32, 2);
    sum += vgetq_lane_f32(sum_f32, 3);

    for(; i < static_cast<int32_t>(num_bins_block); ++i)
    {
        float input_value = output_ptr[i] * scale;

        // Clip scaled input_value if over l2_hyst_threshold
        input_value = std::min(input_value, l2_hyst_threshold);

        sum += input_value * input_value;

        output_ptr[i] = input_value;
    }

    // Use the same constant as OpenCV
    scale     = 1.0f / (std::sqrt(sum) + 1e-3f);
    scale_f32 = vdupq_n_f32(scale);

    // Rescale
    i = 0;

    for(; i <= static_cast<int32_t>(num_bins_block) - 16; i += 16)
    {
        float32x4x4_t input_value =
        {
            {
                vld1q_f32(&output_ptr[i + 0]),
                vld1q_f32(&output_ptr[i + 4]),
                vld1q_f32(&output_ptr[i + 8]),
                vld1q_f32(&output_ptr[i + 12])
            }
        };

        // Scale input_value
        input_value.val[0] = vmulq_f32(input_value.val[0], scale_f32);
        input_value.val[1] = vmulq_f32(input_value.val[1], scale_f32);
        input_value.val[2] = vmulq_f32(input_value.val[2], scale_f32);
        input_value.val[3] = vmulq_f32(input_value.val[3], scale_f32);

        vst1q_f32(&output_ptr[i + 0], input_value.val[0]);
        vst1q_f32(&output_ptr[i + 4], input_value.val[1]);
        vst1q_f32(&output_ptr[i + 8], input_value.val[2]);
        vst1q_f32(&output_ptr[i + 12], input_value.val[3]);
    }

    for(; i < static_cast<int32_t>(num_bins_block); ++i)
    {
        // Store result
        output_ptr[i] *= scale;
    }
}

void l1_norm(const float *__restrict input_row_ptr, float *__restrict output_ptr, size_t input_stride, size_t num_cells_per_block_height, size_t num_bins_block_x, size_t num_bins_block,
             float l2_hyst_threshold)
{
    ARM_COMPUTE_UNUSED(l2_hyst_threshold);

    float       sum     = 0.0f;
    float32x4_t sum_f32 = vdupq_n_f32(0.0f);
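
    // As in l2_norm, the first pass copies the block histogram to the output
    // while accumulating the norm (here the sum of absolute values); the second
    // pass rescales the output by 1 / (sqrt(sum) + num_bins_block * 0.1f).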

    // Compute L1-Norm
    for(size_t yc = 0; yc < num_cells_per_block_height; ++yc)
    {
        const float *const hist_ptr = input_row_ptr + yc * input_stride;

        int32_t xc = 0;

        for(; xc <= static_cast<int32_t>(num_bins_block_x) - 16; xc += 16)
        {
            const float32x4x4_t input_value =
            {
                {
                    vld1q_f32(hist_ptr + xc + 0),
                    vld1q_f32(hist_ptr + xc + 4),
                    vld1q_f32(hist_ptr + xc + 8),
                    vld1q_f32(hist_ptr + xc + 12)
                }
            };

            // Compute |input_value|
            sum_f32 += vabsq_f32(input_value.val[0]);
            sum_f32 += vabsq_f32(input_value.val[1]);
            sum_f32 += vabsq_f32(input_value.val[2]);
            sum_f32 += vabsq_f32(input_value.val[3]);

            vst1q_f32(&output_ptr[xc + 0 + yc * num_bins_block_x], input_value.val[0]);
            vst1q_f32(&output_ptr[xc + 4 + yc * num_bins_block_x], input_value.val[1]);
            vst1q_f32(&output_ptr[xc + 8 + yc * num_bins_block_x], input_value.val[2]);
            vst1q_f32(&output_ptr[xc + 12 + yc * num_bins_block_x], input_value.val[3]);
        }

        // Compute left over
        for(; xc < static_cast<int32_t>(num_bins_block_x); ++xc)
        {
            const float input_value = hist_ptr[xc];

            sum += std::abs(input_value);

            output_ptr[xc + yc * num_bins_block_x] = input_value;
        }
    }

    sum += vgetq_lane_f32(sum_f32, 0);
    sum += vgetq_lane_f32(sum_f32, 1);
    sum += vgetq_lane_f32(sum_f32, 2);
    sum += vgetq_lane_f32(sum_f32, 3);

    const float       scale     = 1.0f / (std::sqrt(sum) + num_bins_block * 0.1f);
    const float32x4_t scale_f32 = vdupq_n_f32(scale);

    int32_t i = 0;

    for(; i <= static_cast<int32_t>(num_bins_block) - 16; i += 16)
    {
        float32x4x4_t input_value =
        {
            {
                vld1q_f32(&output_ptr[i + 0]),
                vld1q_f32(&output_ptr[i + 4]),
                vld1q_f32(&output_ptr[i + 8]),
                vld1q_f32(&output_ptr[i + 12])
            }
        };

        // Scale input_value
        input_value.val[0] = vmulq_f32(input_value.val[0], scale_f32);
        input_value.val[1] = vmulq_f32(input_value.val[1], scale_f32);
        input_value.val[2] = vmulq_f32(input_value.val[2], scale_f32);
        input_value.val[3] = vmulq_f32(input_value.val[3], scale_f32);

        vst1q_f32(&output_ptr[i + 0], input_value.val[0]);
        vst1q_f32(&output_ptr[i + 4], input_value.val[1]);
        vst1q_f32(&output_ptr[i + 8], input_value.val[2]);
        vst1q_f32(&output_ptr[i + 12], input_value.val[3]);
    }

    for(; i < static_cast<int32_t>(num_bins_block); ++i)
    {
        output_ptr[i] *= scale;
    }
}
} // namespace

NEHOGOrientationBinningKernel::NEHOGOrientationBinningKernel()
    : _func(nullptr), _input_magnitude(nullptr), _input_phase(nullptr), _output(nullptr), _cell_width(0), _cell_height(0), _num_bins(0), _phase_scale(0)
{
}

void NEHOGOrientationBinningKernel::configure(const ITensor *input_magnitude, const ITensor *input_phase, ITensor *output, const HOGInfo *hog_info)
{
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_magnitude, 1, DataType::S16);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_phase, 1, DataType::U8);
    ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins(), DataType::F32);
    ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimX) != input_phase->info()->dimension(Window::DimX));
    ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimY) != input_phase->info()->dimension(Window::DimY));

    _input_magnitude = input_magnitude;
    _input_phase     = input_phase;
    _output          = output;
    _cell_width      = hog_info->cell_size().width;
    _cell_height     = hog_info->cell_size().height;
    _num_bins        = hog_info->num_bins();
    _phase_scale     = (PhaseType::SIGNED == hog_info->phase_type() ? _num_bins / 360.0f : _num_bins / 180.0f);
    _phase_scale *= (PhaseType::SIGNED == hog_info->phase_type() ? 360.0f / 255.0f : 1.0f);
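    // The two factors above assume the U8 phase convention of the NEON
    // magnitude/phase kernels: for SIGNED phase the full [0, 360) degree range
    // is compressed into [0, 255], hence the extra 360 / 255 factor; for
    // UNSIGNED the angle is assumed to be stored directly in degrees [0, 180].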
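    // Cells narrower than 8 pixels cannot fill the 8-lane inner loop, so the
    // 4-lane variant is selected for them; both variants handle leftover
    // columns with scalar code.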
    if(_cell_width < 8)
    {
        _func = &cell_width_lt8;
    }
    else
    {
        _func = &cell_width_ge8;
    }

    constexpr unsigned int num_elems_processed_per_iteration = 1;
    const unsigned int     num_elems_read_per_iteration      = 1;
    const unsigned int     num_rows_read_per_iteration       = _cell_height;
    const unsigned int     num_elems_written_per_iteration   = 1;

    // Configure kernel window
    Window                 win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);

    update_window_and_padding(win,
                              AccessWindowRectangle(input_magnitude->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
                              AccessWindowRectangle(input_phase->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
                              output_access);

    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));

    INEKernel::configure(win);
}

void NEHOGOrientationBinningKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
    ARM_COMPUTE_ERROR_ON(_func == nullptr);

    const size_t mag_stride   = _input_magnitude->info()->strides_in_bytes()[Window::DimY] / pixel_size_from_format(_input_magnitude->info()->format());
    const size_t phase_stride = _input_phase->info()->strides_in_bytes()[Window::DimY] / pixel_size_from_format(_input_phase->info()->format());

    Window win_mag(window);
    win_mag.set(Window::DimX, Window::Dimension(window.x().start() * _cell_width, window.x().start() * _cell_width, _cell_width));
    win_mag.set(Window::DimY, Window::Dimension(window.y().start() * _cell_height, window.y().start() * _cell_height, _cell_height));
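    // Each output element corresponds to one cell, so the input windows are
    // anchored at the cell's top-left corner and advance in steps of one cell;
    // the selected cell function walks the cell interior using the row strides
    // computed above.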

    Window win_phase(win_mag);

    Iterator mag(_input_magnitude, win_mag);
    Iterator phase(_input_phase, win_phase);
    Iterator out(_output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        const auto mag_row_ptr   = reinterpret_cast<const int16_t *>(mag.ptr());
        const auto phase_row_ptr = reinterpret_cast<const uint8_t *>(phase.ptr());
        const auto out_row_ptr   = reinterpret_cast<float *>(out.ptr());

        (*_func)(mag_row_ptr, phase_row_ptr, out_row_ptr, mag_stride, phase_stride, _cell_width, _cell_height, _num_bins, _phase_scale);
    },
    mag, phase, out);
}

NEHOGBlockNormalizationKernel::NEHOGBlockNormalizationKernel()
    : _func(nullptr), _input(nullptr), _output(nullptr), _num_cells_per_block(), _num_cells_per_block_stride(), _num_bins(0), _l2_hyst_threshold(0.0f)
{
}

void NEHOGBlockNormalizationKernel::configure(const ITensor *input, ITensor *output, const HOGInfo *hog_info)
{
    ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, hog_info->num_bins(), DataType::F32);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);

    // Number of cells per block
    const Size2D num_cells_per_block(hog_info->block_size().width / hog_info->cell_size().width,
                                     hog_info->block_size().height / hog_info->cell_size().height);

    // Number of cells per block stride
    const Size2D num_cells_per_block_stride(hog_info->block_stride().width / hog_info->cell_size().width,
                                            hog_info->block_stride().height / hog_info->cell_size().height);

    _input                      = input;
    _output                     = output;
    _l2_hyst_threshold          = hog_info->l2_hyst_threshold();
    _num_cells_per_block        = num_cells_per_block;
    _num_cells_per_block_stride = num_cells_per_block_stride;
    _num_bins                   = hog_info->num_bins();

    ARM_COMPUTE_ERROR_ON((output->info()->num_channels() != (_num_bins * num_cells_per_block.width * num_cells_per_block.height)));

    switch(hog_info->normalization_type())
    {
        case HOGNormType::L2_NORM:
            _func = &l2_norm;
            break;
        case HOGNormType::L2HYS_NORM:
            _func = &l2hys_norm;
            break;
        case HOGNormType::L1_NORM:
            _func = &l1_norm;
            break;
        default:
            ARM_COMPUTE_ERROR("Normalisation type not supported");
            break;
    }

    constexpr unsigned int num_elems_processed_per_iteration = 1;
    const unsigned int     num_elems_read_per_iteration      = 1;
    const unsigned int     num_rows_read_per_iteration       = _num_cells_per_block.height;
    const unsigned int     num_elems_written_per_iteration   = 1;
    const unsigned int     num_rows_written_per_iteration    = _num_cells_per_block.height;

    // Configure kernel window
    Window                win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
    AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_written_per_iteration);

    update_window_and_padding(win,
                              AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
                              output_access);

    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));

    INEKernel::configure(win);
}

void NEHOGBlockNormalizationKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON(_func == nullptr);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);

    // Get number of bins per block
    const size_t num_bins_per_block = _output->info()->num_channels();

    // Number of bins on the same row of the block
    const int32_t num_bins_per_block_x = _num_cells_per_block.width * _num_bins;

    const size_t input_stride = _input->info()->strides_in_bytes()[Window::DimY] / data_size_from_type(_input->info()->data_type());
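    // input_stride is the distance, in floats, between consecutive rows of cell
    // histograms, while num_bins_per_block_x spans one row of cells of a block.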

    Window win_in(window);
    win_in.set_dimension_step(Window::DimX, _num_cells_per_block_stride.width);
    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
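    // The Y dimension of the input window is collapsed: the per-block row
    // offset is applied manually inside the loop, as consecutive block rows
    // are num_cells_per_block_stride.height cell rows apart in the input.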

    Iterator in(_input, win_in);
    Iterator out(_output, window);

    // Normalises blocks
    execute_window_loop(window, [&](const Coordinates & id)
    {
        const auto input_row_ptr = reinterpret_cast<const float *>(in.ptr() + id.y() * _num_cells_per_block_stride.height * _input->info()->strides_in_bytes()[Window::DimY]);
        const auto out_row_ptr   = reinterpret_cast<float *>(out.ptr());

        // Execute normalization function
        (*_func)(input_row_ptr, out_row_ptr, input_stride, _num_cells_per_block.height, num_bins_per_block_x, num_bins_per_block, _l2_hyst_threshold);
    },
    in, out);
}