Blame - src/core/NEON/kernels/NEHOGDescriptorKernel.cpp - ml/ComputeLibrary

blob: 3fd81bed1c8effcd3f7edcc4ede452ae5567484c [file] [log] [blame]

Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2016, 2017 ARM Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h"
				25
				26	#include "arm_compute/core/Error.h"
				27	#include "arm_compute/core/HOGInfo.h"
				28	#include "arm_compute/core/Helpers.h"
				29	#include "arm_compute/core/IAccessWindow.h"
				30	#include "arm_compute/core/Validate.h"
				31
				32	#include <algorithm>
				33	#include <arm_neon.h>
				34	#include <cstring>
				35
				36	using namespace arm_compute;
				37
				38	namespace
				39	{
				40	void cell_width_lt8(const int16_t __restrict mag_row_ptr, const uint8_t __restrict phase_row_ptr, float *__restrict output_ptr,
				41	size_t mag_stride, size_t phase_stride, size_t cell_width, size_t cell_height, size_t num_bins, float phase_scale)
				42	{
				43	const float32x4_t scale_f32 = vdupq_n_f32(phase_scale);
				44	static const float32x4_t one_f32 = vdupq_n_f32(1.0f);
				45	static const float32x4_t zerofive_f32 = vdupq_n_f32(0.5f);
				46	static const int32x4_t zero_s32 = vdupq_n_s32(0);
				47	static const int32x4_t one_s32 = vdupq_n_s32(1);
				48	const int32x4_t num_bins_s32 = vdupq_n_s32(num_bins);
				49
				50	memset(output_ptr, 0, sizeof(float) * num_bins);
				51
				52	for(size_t yc = 0; yc < cell_height; ++yc)
				53	{
				54	int32_t xc = 0;
				55
				56	for(; xc <= static_cast<int32_t>(cell_width) - 4; xc += 4)
				57	{
				58	// Load magnitude and phase values
				59	const uint8x8_t phase_u8 = vld1_u8(phase_row_ptr + xc + yc * phase_stride);
				60	const int16x4_t mag_s16 = vld1_s16(mag_row_ptr + xc + yc * mag_stride);
				61
				62	// Convert magnitude and phase to float
				63	const float32x4_t mag_f32 = vcvtq_f32_s32(vmovl_s16(mag_s16));
				64	float32x4_t phase_f32 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(phase_u8))));
				65
				66	// Scale phase: phase * scale + 0.5f
				67	phase_f32 = vmlaq_f32(zerofive_f32, phase_f32, scale_f32);
				68
				69	// Compute histogram index.
				70	int32x4_t hidx_s32 = vcvtq_s32_f32(phase_f32);
				71
				72	// Compute magnitude weights (w0 and w1)
				73	const float32x4_t hidx_f32 = vcvtq_f32_s32(hidx_s32);
				74
				75	// w1 = phase_f32 - hidx_f32
				76	const float32x4_t w1_f32 = vsubq_f32(phase_f32, hidx_f32);
				77
				78	// w0 = 1.0 - w1
				79	const float32x4_t w0_f32 = vsubq_f32(one_f32, w1_f32);
				80
				81	// Compute contribute for splitting vote
				82	const float32x4_t mag_w0_f32 = vmulq_f32(mag_f32, w0_f32);
				83	const float32x4_t mag_w1_f32 = vmulq_f32(mag_f32, w1_f32);
				84
				85	// Weighted vote between 2 bins
				86
				87	// Check if the histogram index is equal to num_bins. If so, replace the index with 0
				88	uint32x4_t mask = vceqq_s32(hidx_s32, num_bins_s32);
				89	hidx_s32 = vbslq_s32(mask, zero_s32, hidx_s32);
				90
				91	// Bin 0
				92	*(output_ptr + vgetq_lane_s32(hidx_s32, 0)) += vgetq_lane_f32(mag_w0_f32, 0);
				93	*(output_ptr + vgetq_lane_s32(hidx_s32, 1)) += vgetq_lane_f32(mag_w0_f32, 1);
				94	*(output_ptr + vgetq_lane_s32(hidx_s32, 2)) += vgetq_lane_f32(mag_w0_f32, 2);
				95	*(output_ptr + vgetq_lane_s32(hidx_s32, 3)) += vgetq_lane_f32(mag_w0_f32, 3);
				96
				97	hidx_s32 = vaddq_s32(hidx_s32, one_s32);
				98
				99	// Check if the histogram index is equal to num_bins
				100	mask = vceqq_s32(hidx_s32, num_bins_s32);
				101	hidx_s32 = vbslq_s32(mask, zero_s32, hidx_s32);
				102
				103	// Bin1
				104	*(output_ptr + vgetq_lane_s32(hidx_s32, 0)) += vgetq_lane_f32(mag_w1_f32, 0);
				105	*(output_ptr + vgetq_lane_s32(hidx_s32, 1)) += vgetq_lane_f32(mag_w1_f32, 1);
				106	*(output_ptr + vgetq_lane_s32(hidx_s32, 2)) += vgetq_lane_f32(mag_w1_f32, 2);
				107	*(output_ptr + vgetq_lane_s32(hidx_s32, 3)) += vgetq_lane_f32(mag_w1_f32, 3);
				108	}
				109
				110	for(; xc < static_cast<int32_t>(cell_width); ++xc)
				111	{
				112	const float phase_value = (phase_row_ptr + xc + yc phase_stride) * phase_scale + 0.5f;
				113	const float mag_value = (mag_row_ptr + xc + yc mag_stride);
				114
				115	const float w1 = phase_value - std::floor(phase_value);
				116
				117	// The quantised phase is the histogram index [0, num_bins - 1] - Round
				118	// Check limit of histogram index. If hidx == num_bins, hidx = 0
				119	const auto hidx = static_cast<size_t>(phase_value) % num_bins;
				120
				121	// Weighted vote between 2 bins
				122	(output_ptr + hidx) += mag_value (1.0f - w1);
				123	(output_ptr + ((hidx + 1) % (num_bins))) += mag_value w1;
				124	}
				125	}
				126	}
				127
				128	void cell_width_ge8(const int16_t __restrict mag_row_ptr, const uint8_t __restrict phase_row_ptr, float *__restrict output_ptr, size_t mag_stride, size_t phase_stride, size_t cell_width,
				129	size_t cell_height, size_t num_bins, float phase_scale)
				130	{
				131	const float32x4_t scale_f32 = vdupq_n_f32(phase_scale);
				132	static const float32x4_t one_f32 = vdupq_n_f32(1.0f);
				133	static const float32x4_t zerofive_f32 = vdupq_n_f32(0.5f);
				134	static const int32x4_t zero_s32 = vdupq_n_s32(0);
				135	static const int32x4_t one_s32 = vdupq_n_s32(1);
				136	const int32x4_t num_bins_s32 = vdupq_n_s32(num_bins);
				137
				138	memset(output_ptr, 0, sizeof(float) * num_bins);
				139
				140	for(size_t yc = 0; yc < cell_height; ++yc)
				141	{
				142	int32_t xc = 0;
				143
				144	for(; xc <= static_cast<int32_t>(cell_width) - 8; xc += 8)
				145	{
				146	// Load magnitude and phase values
				147	const uint8x8_t phase_u8 = vld1_u8(phase_row_ptr + xc + yc * phase_stride);
				148	const int16x8_t mag_s16 = vld1q_s16(mag_row_ptr + xc + yc * mag_stride);
				149
				150	// Convert phase to U16
				151	const uint16x8_t phase_u16 = vmovl_u8(phase_u8);
				152
				153	// Convert magnitude to float32
				154	const float32x4x2_t mag_f32 =
				155	{
				156	{
				157	vcvtq_f32_s32(vmovl_s16(vget_low_s16(mag_s16))),
				158	vcvtq_f32_s32(vmovl_s16(vget_high_s16(mag_s16)))
				159	}
				160	};
				161
				162	// Convert phase to float32
				163	float32x4x2_t phase_f32 =
				164	{
				165	{
				166	vcvtq_f32_u32(vmovl_u16(vget_low_u16(phase_u16))),
				167	vcvtq_f32_u32(vmovl_u16(vget_high_u16(phase_u16)))
				168	}
				169	};
				170
				171	// Scale phase: phase * scale + 0.5f
				172	phase_f32.val[0] = vmlaq_f32(zerofive_f32, phase_f32.val[0], scale_f32);
				173	phase_f32.val[1] = vmlaq_f32(zerofive_f32, phase_f32.val[1], scale_f32);
				174
				175	// Compute histogram index.
				176	int32x4x2_t hidx_s32 =
				177	{
				178	{
				179	vcvtq_s32_f32(phase_f32.val[0]),
				180	vcvtq_s32_f32(phase_f32.val[1])
				181	}
				182	};
				183
				184	// Compute magnitude weights (w0 and w1)
				185	const float32x4x2_t hidx_f32 =
				186	{
				187	{
				188	vcvtq_f32_s32(hidx_s32.val[0]),
				189	vcvtq_f32_s32(hidx_s32.val[1])
				190	}
				191	};
				192
				193	float32x4x2_t w1_f32 =
				194	{
				195	{
				196	vsubq_f32(phase_f32.val[0], hidx_f32.val[0]),
				197	vsubq_f32(phase_f32.val[1], hidx_f32.val[1])
				198	}
				199	};
				200
				201	float32x4x2_t w0_f32 =
				202	{
				203	{
				204	vsubq_f32(one_f32, w1_f32.val[0]),
				205	vsubq_f32(one_f32, w1_f32.val[1])
				206	}
				207	};
				208
				209	// Compute contribute for splitting vote
				210	const float32x4x2_t mag_w0_f32 =
				211	{
				212	{
				213	vmulq_f32(mag_f32.val[0], w0_f32.val[0]),
				214	vmulq_f32(mag_f32.val[1], w0_f32.val[1])
				215	}
				216	};
				217
				218	const float32x4x2_t mag_w1_f32 =
				219	{
				220	{
				221	vmulq_f32(mag_f32.val[0], w1_f32.val[0]),
				222	vmulq_f32(mag_f32.val[1], w1_f32.val[1])
				223	}
				224	};
				225
				226	// Weighted vote between 2 bins
				227
				228	// Check if the histogram index is equal to num_bins
				229	uint32x4x2_t mask =
				230	{
				231	{
				232	vceqq_s32(hidx_s32.val[0], num_bins_s32),
				233	vceqq_s32(hidx_s32.val[1], num_bins_s32)
				234	}
				235	};
				236
				237	hidx_s32.val[0] = vbslq_s32(mask.val[0], zero_s32, hidx_s32.val[0]);
				238	hidx_s32.val[1] = vbslq_s32(mask.val[1], zero_s32, hidx_s32.val[1]);
				239
				240	// First bin - Low
				241	*(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 0)) += vgetq_lane_f32(mag_w0_f32.val[0], 0);
				242	*(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 1)) += vgetq_lane_f32(mag_w0_f32.val[0], 1);
				243	*(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 2)) += vgetq_lane_f32(mag_w0_f32.val[0], 2);
				244	*(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 3)) += vgetq_lane_f32(mag_w0_f32.val[0], 3);
				245
				246	// First bin - high
				247	*(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 0)) += vgetq_lane_f32(mag_w0_f32.val[1], 0);
				248	*(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 1)) += vgetq_lane_f32(mag_w0_f32.val[1], 1);
				249	*(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 2)) += vgetq_lane_f32(mag_w0_f32.val[1], 2);
				250	*(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 3)) += vgetq_lane_f32(mag_w0_f32.val[1], 3);
				251
				252	hidx_s32.val[0] = vaddq_s32(hidx_s32.val[0], one_s32);
				253	hidx_s32.val[1] = vaddq_s32(hidx_s32.val[1], one_s32);
				254
				255	// Check if the histogram index is equal to num_bins
				256	mask.val[0] = vceqq_s32(hidx_s32.val[0], num_bins_s32);
				257	mask.val[1] = vceqq_s32(hidx_s32.val[1], num_bins_s32);
				258
				259	hidx_s32.val[0] = vbslq_s32(mask.val[0], zero_s32, hidx_s32.val[0]);
				260	hidx_s32.val[1] = vbslq_s32(mask.val[1], zero_s32, hidx_s32.val[1]);
				261
				262	// Second bin - Low
				263	*(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 0)) += vgetq_lane_f32(mag_w1_f32.val[0], 0);
				264	*(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 1)) += vgetq_lane_f32(mag_w1_f32.val[0], 1);
				265	*(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 2)) += vgetq_lane_f32(mag_w1_f32.val[0], 2);
				266	*(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 3)) += vgetq_lane_f32(mag_w1_f32.val[0], 3);
				267
				268	// Second bin - high
				269	*(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 0)) += vgetq_lane_f32(mag_w1_f32.val[1], 0);
				270	*(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 1)) += vgetq_lane_f32(mag_w1_f32.val[1], 1);
				271	*(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 2)) += vgetq_lane_f32(mag_w1_f32.val[1], 2);
				272	*(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 3)) += vgetq_lane_f32(mag_w1_f32.val[1], 3);
				273	}
				274
				275	for(; xc < static_cast<int32_t>(cell_width); xc++)
				276	{
				277	const float phase_value = (phase_row_ptr + xc + yc phase_stride) * phase_scale + 0.5f;
				278	const float mag_value = (mag_row_ptr + xc + yc mag_stride);
				279
				280	const float w1 = phase_value - std::floor(phase_value);
				281
				282	// The quantised phase is the histogram index [0, num_bins - 1] - Round
				283	// Check limit of histogram index. If hidx == num_bins, hidx = 0
				284	const size_t hidx = static_cast<size_t>(phase_value) % num_bins;
				285
				286	// Weighted vote between 2 bins
				287	(output_ptr + hidx) += mag_value (1.0f - w1);
				288	(output_ptr + ((hidx + 1) % (num_bins))) += mag_value w1;
				289	}
				290	}
				291	}
				292
				293	void l2_norm(const float __restrict input_row_ptr, float __restrict output_ptr, size_t input_stride,
				294	size_t num_cells_per_block_height, size_t num_bins_block_x, size_t num_bins_block, float l2_hyst_threshold)
				295	{
				296	ARM_COMPUTE_UNUSED(l2_hyst_threshold);
				297
				298	float sum = 0.0f;
				299	float32x4_t sum_f32 = vdupq_n_f32(0.0f);
				300
				301	// Compute L2-Norm
				302	for(size_t yc = 0; yc < num_cells_per_block_height; ++yc)
				303	{
				304	const float const hist_ptr = input_row_ptr + yc input_stride;
				305
				306	int32_t xc = 0;
				307
				308	for(; xc <= static_cast<int32_t>(num_bins_block_x) - 16; xc += 16)
				309	{
				310	const float32x4x4_t input_value =
				311	{
				312	{
				313	vld1q_f32(hist_ptr + xc + 0),
				314	vld1q_f32(hist_ptr + xc + 4),
				315	vld1q_f32(hist_ptr + xc + 8),
				316	vld1q_f32(hist_ptr + xc + 12)
				317	}
				318	};
				319
				320	// Compute input_value^2
				321	sum_f32 = vmlaq_f32(sum_f32, input_value.val[0], input_value.val[0]);
				322	sum_f32 = vmlaq_f32(sum_f32, input_value.val[1], input_value.val[1]);
				323	sum_f32 = vmlaq_f32(sum_f32, input_value.val[2], input_value.val[2]);
				324	sum_f32 = vmlaq_f32(sum_f32, input_value.val[3], input_value.val[3]);
				325
				326	vst1q_f32(&output_ptr[xc + 0 + yc * num_bins_block_x], input_value.val[0]);
				327	vst1q_f32(&output_ptr[xc + 4 + yc * num_bins_block_x], input_value.val[1]);
				328	vst1q_f32(&output_ptr[xc + 8 + yc * num_bins_block_x], input_value.val[2]);
				329	vst1q_f32(&output_ptr[xc + 12 + yc * num_bins_block_x], input_value.val[3]);
				330	}
				331
				332	// Compute left over
				333	for(; xc < static_cast<int32_t>(num_bins_block_x); xc++)
				334	{
				335	const float input_value = hist_ptr[xc];
				336
				337	sum += input_value * input_value;
				338
				339	output_ptr[xc + yc * num_bins_block_x] = input_value;
				340	}
				341	}
				342
				343	sum += vgetq_lane_f32(sum_f32, 0);
				344	sum += vgetq_lane_f32(sum_f32, 1);
				345	sum += vgetq_lane_f32(sum_f32, 2);
				346	sum += vgetq_lane_f32(sum_f32, 3);
				347
				348	const float scale = 1.0f / (std::sqrt(sum) + num_bins_block * 0.1f);
				349	const float32x4_t scale_f32 = vdupq_n_f32(scale);
				350
				351	int32_t i = 0;
				352
				353	for(; i <= static_cast<int32_t>(num_bins_block) - 16; i += 16)
				354	{
				355	float32x4x4_t input_value =
				356	{
				357	{
				358	vld1q_f32(&output_ptr[i + 0]),
				359	vld1q_f32(&output_ptr[i + 4]),
				360	vld1q_f32(&output_ptr[i + 8]),
				361	vld1q_f32(&output_ptr[i + 12])
				362	}
				363	};
				364
				365	// Scale input_value
				366	input_value.val[0] = vmulq_f32(input_value.val[0], scale_f32);
				367	input_value.val[1] = vmulq_f32(input_value.val[1], scale_f32);
				368	input_value.val[2] = vmulq_f32(input_value.val[2], scale_f32);
				369	input_value.val[3] = vmulq_f32(input_value.val[3], scale_f32);
				370
				371	vst1q_f32(&output_ptr[i + 0], input_value.val[0]);
				372	vst1q_f32(&output_ptr[i + 4], input_value.val[1]);
				373	vst1q_f32(&output_ptr[i + 8], input_value.val[2]);
				374	vst1q_f32(&output_ptr[i + 12], input_value.val[3]);
				375	}
				376
				377	for(; i < static_cast<int32_t>(num_bins_block); ++i)
				378	{
				379	output_ptr[i] *= scale;
				380	}
				381	}
				382
				383	void l2hys_norm(const float __restrict input_row_ptr, float __restrict output_ptr, size_t input_stride, size_t num_cells_per_block_height, size_t num_bins_block_x, size_t num_bins_block,
				384	float l2_hyst_threshold)
				385	{
				386	float sum = 0.0f;
				387	float32x4_t sum_f32 = vdupq_n_f32(0.0f);
				388
				389	// Compute L2-Hys
				390	for(size_t yc = 0; yc < num_cells_per_block_height; ++yc)
				391	{
				392	const float const hist_ptr = input_row_ptr + yc input_stride;
				393
				394	int32_t xc = 0;
				395
				396	for(; xc <= static_cast<int32_t>(num_bins_block_x) - 16; xc += 16)
				397	{
				398	const float32x4x4_t input_value =
				399	{
				400	{
				401	vld1q_f32(hist_ptr + xc + 0),
				402	vld1q_f32(hist_ptr + xc + 4),
				403	vld1q_f32(hist_ptr + xc + 8),
				404	vld1q_f32(hist_ptr + xc + 12)
				405	}
				406	};
				407
				408	// Compute input_value^2
				409	sum_f32 = vmlaq_f32(sum_f32, input_value.val[0], input_value.val[0]);
				410	sum_f32 = vmlaq_f32(sum_f32, input_value.val[1], input_value.val[1]);
				411	sum_f32 = vmlaq_f32(sum_f32, input_value.val[2], input_value.val[2]);
				412	sum_f32 = vmlaq_f32(sum_f32, input_value.val[3], input_value.val[3]);
				413
				414	vst1q_f32(&output_ptr[xc + 0 + yc * num_bins_block_x], input_value.val[0]);
				415	vst1q_f32(&output_ptr[xc + 4 + yc * num_bins_block_x], input_value.val[1]);
				416	vst1q_f32(&output_ptr[xc + 8 + yc * num_bins_block_x], input_value.val[2]);
				417	vst1q_f32(&output_ptr[xc + 12 + yc * num_bins_block_x], input_value.val[3]);
				418	}
				419
				420	// Compute left over
				421	for(; xc < static_cast<int32_t>(num_bins_block_x); ++xc)
				422	{
				423	const float input_value = hist_ptr[xc];
				424
				425	sum += input_value * input_value;
				426
				427	output_ptr[xc + yc * num_bins_block_x] = input_value;
				428	}
				429	}
				430
				431	sum += vgetq_lane_f32(sum_f32, 0);
				432	sum += vgetq_lane_f32(sum_f32, 1);
				433	sum += vgetq_lane_f32(sum_f32, 2);
				434	sum += vgetq_lane_f32(sum_f32, 3);
				435
				436	float scale = 1.0f / (std::sqrt(sum) + num_bins_block * 0.1f);
				437	float32x4_t scale_f32 = vdupq_n_f32(scale);
				438	const float32x4_t l2_hyst_threshold_f32 = vdupq_n_f32(l2_hyst_threshold);
				439
				440	// Reset sum
				441	sum_f32 = vdupq_n_f32(0.0f);
				442	sum = 0.0f;
				443
				444	int32_t i = 0;
				445
				446	for(; i <= static_cast<int32_t>(num_bins_block) - 16; i += 16)
				447	{
				448	float32x4x4_t input_value =
				449	{
				450	{
				451	vld1q_f32(&output_ptr[i + 0]),
				452	vld1q_f32(&output_ptr[i + 4]),
				453	vld1q_f32(&output_ptr[i + 8]),
				454	vld1q_f32(&output_ptr[i + 12])
				455	}
				456	};
				457
				458	// Scale input_value
				459	input_value.val[0] = vmulq_f32(input_value.val[0], scale_f32);
				460	input_value.val[1] = vmulq_f32(input_value.val[1], scale_f32);
				461	input_value.val[2] = vmulq_f32(input_value.val[2], scale_f32);
				462	input_value.val[3] = vmulq_f32(input_value.val[3], scale_f32);
				463
				464	// Clip input_value if over _threshold_l2hys
				465	input_value.val[0] = vminq_f32(input_value.val[0], l2_hyst_threshold_f32);
				466	input_value.val[1] = vminq_f32(input_value.val[1], l2_hyst_threshold_f32);
				467	input_value.val[2] = vminq_f32(input_value.val[2], l2_hyst_threshold_f32);
				468	input_value.val[3] = vminq_f32(input_value.val[3], l2_hyst_threshold_f32);
				469
				470	// Compute input_value^2
				471	sum_f32 = vmlaq_f32(sum_f32, input_value.val[0], input_value.val[0]);
				472	sum_f32 = vmlaq_f32(sum_f32, input_value.val[1], input_value.val[1]);
				473	sum_f32 = vmlaq_f32(sum_f32, input_value.val[2], input_value.val[2]);
				474	sum_f32 = vmlaq_f32(sum_f32, input_value.val[3], input_value.val[3]);
				475
				476	vst1q_f32(&output_ptr[i + 0], input_value.val[0]);
				477	vst1q_f32(&output_ptr[i + 4], input_value.val[1]);
				478	vst1q_f32(&output_ptr[i + 8], input_value.val[2]);
				479	vst1q_f32(&output_ptr[i + 12], input_value.val[3]);
				480	}
				481
				482	sum += vgetq_lane_f32(sum_f32, 0);
				483	sum += vgetq_lane_f32(sum_f32, 1);
				484	sum += vgetq_lane_f32(sum_f32, 2);
				485	sum += vgetq_lane_f32(sum_f32, 3);
				486
				487	for(; i < static_cast<int32_t>(num_bins_block); ++i)
				488	{
				489	float input_value = output_ptr[i] * scale;
				490
				491	// Clip scaled input_value if over _threshold_L2hys
				492	input_value = std::min(input_value, l2_hyst_threshold);
				493
				494	sum += input_value * input_value;
				495
				496	output_ptr[i] = input_value;
				497	}
				498
				499	// We use the same constants of OpenCV
				500	scale = 1.0f / (std::sqrt(sum) + 1e-3f);
				501	scale_f32 = vdupq_n_f32(scale);
				502
				503	// Rescale
				504	i = 0;
				505
				506	for(; i <= static_cast<int32_t>(num_bins_block) - 16; i += 16)
				507	{
				508	float32x4x4_t input_value =
				509	{
				510	{
				511	vld1q_f32(&output_ptr[i + 0]),
				512	vld1q_f32(&output_ptr[i + 4]),
				513	vld1q_f32(&output_ptr[i + 8]),
				514	vld1q_f32(&output_ptr[i + 12])
				515	}
				516	};
				517
				518	// Scale input_value
				519	input_value.val[0] = vmulq_f32(input_value.val[0], scale_f32);
				520	input_value.val[1] = vmulq_f32(input_value.val[1], scale_f32);
				521	input_value.val[2] = vmulq_f32(input_value.val[2], scale_f32);
				522	input_value.val[3] = vmulq_f32(input_value.val[3], scale_f32);
				523
				524	vst1q_f32(&output_ptr[i + 0], input_value.val[0]);
				525	vst1q_f32(&output_ptr[i + 4], input_value.val[1]);
				526	vst1q_f32(&output_ptr[i + 8], input_value.val[2]);
				527	vst1q_f32(&output_ptr[i + 12], input_value.val[3]);
				528	}
				529
				530	for(; i < static_cast<int32_t>(num_bins_block); ++i)
				531	{
				532	// Store result
				533	output_ptr[i] *= scale;
				534	}
				535	}
				536
				537	void l1_norm(const float __restrict input_row_ptr, float __restrict output_ptr, size_t input_stride, size_t num_cells_per_block_height, size_t num_bins_block_x, size_t num_bins_block,
				538	float l2_hyst_threshold)
				539	{
				540	ARM_COMPUTE_UNUSED(l2_hyst_threshold);
				541
				542	float sum = 0.0f;
				543	float32x4_t sum_f32 = vdupq_n_f32(0.0f);
				544
				545	// Compute L1-Norm
				546	for(size_t yc = 0; yc < num_cells_per_block_height; ++yc)
				547	{
				548	const float const hist_ptr = input_row_ptr + yc input_stride;
				549
				550	int32_t xc = 0;
				551
				552	for(; xc <= static_cast<int32_t>(num_bins_block_x) - 16; xc += 16)
				553	{
				554	const float32x4x4_t input_value =
				555	{
				556	{
				557	vld1q_f32(hist_ptr + xc + 0),
				558	vld1q_f32(hist_ptr + xc + 4),
				559	vld1q_f32(hist_ptr + xc + 8),
				560	vld1q_f32(hist_ptr + xc + 12)
				561	}
				562	};
				563
				564	// Compute \|input_value\|
				565	sum_f32 += vabsq_f32(input_value.val[0]);
				566	sum_f32 += vabsq_f32(input_value.val[1]);
				567	sum_f32 += vabsq_f32(input_value.val[2]);
				568	sum_f32 += vabsq_f32(input_value.val[3]);
				569
				570	vst1q_f32(&output_ptr[xc + 0 + yc * num_bins_block_x], input_value.val[0]);
				571	vst1q_f32(&output_ptr[xc + 4 + yc * num_bins_block_x], input_value.val[1]);
				572	vst1q_f32(&output_ptr[xc + 8 + yc * num_bins_block_x], input_value.val[2]);
				573	vst1q_f32(&output_ptr[xc + 12 + yc * num_bins_block_x], input_value.val[3]);
				574	}
				575
				576	for(; xc < static_cast<int32_t>(num_bins_block_x); xc++)
				577	{
				578	const float input_value = hist_ptr[xc];
				579
				580	sum += std::abs(input_value);
				581
				582	output_ptr[xc + yc * num_bins_block_x] = input_value;
				583	}
				584	}
				585
				586	sum += vgetq_lane_f32(sum_f32, 0);
				587	sum += vgetq_lane_f32(sum_f32, 1);
				588	sum += vgetq_lane_f32(sum_f32, 2);
				589	sum += vgetq_lane_f32(sum_f32, 3);
				590
				591	const float scale = 1.0f / (std::sqrt(sum) + num_bins_block * 0.1f);
				592	const float32x4_t scale_f32 = vdupq_n_f32(scale);
				593
				594	int32_t i = 0;
				595
				596	for(; i <= static_cast<int32_t>(num_bins_block) - 16; i += 16)
				597	{
				598	float32x4x4_t input_value =
				599	{
				600	{
				601	vld1q_f32(&output_ptr[i + 0]),
				602	vld1q_f32(&output_ptr[i + 4]),
				603	vld1q_f32(&output_ptr[i + 8]),
				604	vld1q_f32(&output_ptr[i + 12])
				605	}
				606	};
				607
				608	// Scale input_value
				609	input_value.val[0] = vmulq_f32(input_value.val[0], scale_f32);
				610	input_value.val[1] = vmulq_f32(input_value.val[1], scale_f32);
				611	input_value.val[2] = vmulq_f32(input_value.val[2], scale_f32);
				612	input_value.val[3] = vmulq_f32(input_value.val[3], scale_f32);
				613
				614	vst1q_f32(&output_ptr[i + 0], input_value.val[0]);
				615	vst1q_f32(&output_ptr[i + 4], input_value.val[1]);
				616	vst1q_f32(&output_ptr[i + 8], input_value.val[2]);
				617	vst1q_f32(&output_ptr[i + 12], input_value.val[3]);
				618	}
				619
				620	for(; i < static_cast<int32_t>(num_bins_block); ++i)
				621	{
				622	output_ptr[i] *= scale;
				623	}
				624	}
				625	} // namespace
				626
				627	NEHOGOrientationBinningKernel::NEHOGOrientationBinningKernel()
				628	: _func(nullptr), _input_magnitude(nullptr), _input_phase(nullptr), _output(nullptr), _cell_width(0), _cell_height(0), _num_bins(0), _phase_scale(0)
				629	{
				630	}
				631
				632	void NEHOGOrientationBinningKernel::configure(const ITensor input_magnitude, const ITensor input_phase, ITensor output, const HOGInfo hog_info)
				633	{
				634	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_magnitude, 1, DataType::S16);
				635	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_phase, 1, DataType::U8);
				636	ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
				637	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins(), DataType::F32);
				638	ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimX) != input_phase->info()->dimension(Window::DimX));
				639	ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimY) != input_phase->info()->dimension(Window::DimY));
				640
				641	_input_magnitude = input_magnitude;
				642	_input_phase = input_phase;
				643	_output = output;
				644	_cell_width = hog_info->cell_size().width;
				645	_cell_height = hog_info->cell_size().height;
				646	_num_bins = hog_info->num_bins();
				647	_phase_scale = (PhaseType::SIGNED == hog_info->phase_type() ? _num_bins / 360.0f : _num_bins / 180.0f);
				648	_phase_scale *= (PhaseType::SIGNED == hog_info->phase_type() ? 360.0f / 255.0f : 1.0f);
				649
				650	if(_cell_width < 8)
				651	{
				652	_func = &cell_width_lt8;
				653	}
				654	else
				655	{
				656	_func = &cell_width_ge8;
				657	}
				658
				659	constexpr unsigned int num_elems_processed_per_iteration = 1;
				660	const unsigned int num_elems_read_per_iteration = 1;
				661	const unsigned int num_rows_read_per_iteration = _cell_height;
				662	const unsigned int num_elems_written_per_iteration = 1;
				663
				664	// Configure kernel window
				665	Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
				666	AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
				667
				668	update_window_and_padding(win,
				669	AccessWindowRectangle(input_magnitude->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
				670	AccessWindowRectangle(input_phase->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
				671	output_access);
				672
				673	output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
				674
				675	INEKernel::configure(win);
				676	}
				677
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	678	void NEHOGOrientationBinningKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	679	{
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	680	ARM_COMPUTE_UNUSED(info);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	681	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				682	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
				683	ARM_COMPUTE_ERROR_ON(_func == nullptr);
				684
				685	const size_t mag_stride = _input_magnitude->info()->strides_in_bytes()[Window::DimY] / pixel_size_from_format(_input_magnitude->info()->format());
				686	const size_t phase_stride = _input_phase->info()->strides_in_bytes()[Window::DimY] / pixel_size_from_format(_input_phase->info()->format());
				687
				688	Window win_mag(window);
				689	win_mag.set(Window::DimX, Window::Dimension(window.x().start() * _cell_width, window.x().start() * _cell_width, _cell_width));
				690	win_mag.set(Window::DimY, Window::Dimension(window.y().start() * _cell_height, window.y().start() * _cell_height, _cell_height));
				691
				692	Window win_phase(win_mag);
				693
				694	Iterator mag(_input_magnitude, win_mag);
				695	Iterator phase(_input_phase, win_phase);
				696	Iterator out(_output, window);
				697
				698	execute_window_loop(window, [&](const Coordinates & id)
				699	{
				700	const auto mag_row_ptr = reinterpret_cast<const int16_t *>(mag.ptr());
				701	const auto phase_row_ptr = reinterpret_cast<const uint8_t *>(phase.ptr());
				702	const auto out_row_ptr = reinterpret_cast<float *>(out.ptr());
				703
				704	(*_func)(mag_row_ptr, phase_row_ptr, out_row_ptr, mag_stride, phase_stride, _cell_width, _cell_height, _num_bins, _phase_scale);
				705	},
				706	mag, phase, out);
				707	}
				708
				709	NEHOGBlockNormalizationKernel::NEHOGBlockNormalizationKernel()
				710	: _func(nullptr), _input(nullptr), _output(nullptr), _num_cells_per_block(), _num_cells_per_block_stride(), _num_bins(0), _l2_hyst_threshold(0.0f)
				711	{
				712	}
				713
				714	void NEHOGBlockNormalizationKernel::configure(const ITensor input, ITensor output, const HOGInfo *hog_info)
				715	{
				716	ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
				717	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, hog_info->num_bins(), DataType::F32);
				718	ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
				719
				720	// Number of cells per block
				721	const Size2D num_cells_per_block(hog_info->block_size().width / hog_info->cell_size().width,
				722	hog_info->block_size().height / hog_info->cell_size().height);
				723
				724	// Number of cells per block stride
				725	const Size2D num_cells_per_block_stride(hog_info->block_stride().width / hog_info->cell_size().width,
				726	hog_info->block_stride().height / hog_info->cell_size().height);
				727
				728	_input = input;
				729	_output = output;
				730	_l2_hyst_threshold = hog_info->l2_hyst_threshold();
				731	_num_cells_per_block = num_cells_per_block;
				732	_num_cells_per_block_stride = num_cells_per_block_stride;
				733	_num_bins = hog_info->num_bins();
				734
				735	ARM_COMPUTE_ERROR_ON((output->info()->num_channels() != (_num_bins * num_cells_per_block.width * num_cells_per_block.height)));
				736
				737	switch(hog_info->normalization_type())
				738	{
				739	case HOGNormType::L2_NORM:
				740	_func = &l2_norm;
				741	break;
				742	case HOGNormType::L2HYS_NORM:
				743	_func = &l2hys_norm;
				744	break;
				745	case HOGNormType::L1_NORM:
				746	_func = &l1_norm;
				747	break;
				748	default:
				749	ARM_COMPUTE_ERROR_ON("Normalisation type not supported");
				750	break;
				751	}
				752
				753	constexpr unsigned int num_elems_processed_per_iteration = 1;
				754	const unsigned int num_elems_read_per_iteration = 1;
				755	const unsigned int num_rows_read_per_iteration = _num_cells_per_block.height;
				756	const unsigned int num_elems_written_per_iteration = 1;
				757	const unsigned int num_rows_written_per_iteration = _num_cells_per_block.height;
				758
				759	// Configure kernel window
				760	Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
				761	AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_written_per_iteration);
				762
				763	update_window_and_padding(win,
				764	AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
				765	output_access);
				766
				767	output_access.set_valid_region(win, input->info()->valid_region());
				768
				769	INEKernel::configure(win);
				770	}
				771
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	772	void NEHOGBlockNormalizationKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	773	{
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	774	ARM_COMPUTE_UNUSED(info);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	775	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				776	ARM_COMPUTE_ERROR_ON(_func == nullptr);
				777	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
				778
				779	// Get number of bins per block
				780	const size_t num_bins_per_block = _output->info()->num_channels();
				781
				782	// Number of bins on the same row of the block
				783	const int32_t num_bins_per_block_x = _num_cells_per_block.width * _num_bins;
				784
				785	const size_t input_stride = _input->info()->strides_in_bytes()[Window::DimY] / data_size_from_type(_input->info()->data_type());
				786
				787	Window win_in(window);
				788	win_in.set_dimension_step(Window::DimX, _num_cells_per_block_stride.width);
				789	win_in.set_dimension_step(Window::DimY, _num_cells_per_block_stride.height);
				790
				791	Iterator in(_input, win_in);
				792	Iterator out(_output, window);
				793
				794	// Normalises blocks
				795	execute_window_loop(window, [&](const Coordinates & id)
				796	{
				797	const auto input_row_ptr = reinterpret_cast<const float *>(in.ptr());
				798	const auto out_row_ptr = reinterpret_cast<float *>(out.ptr());
				799
				800	// Execute normalization function
				801	(*_func)(input_row_ptr, out_row_ptr, input_stride, _num_cells_per_block.height, num_bins_per_block_x, num_bins_per_block, _l2_hyst_threshold);
				802	},
				803	in, out);
				804	}