/*
 * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/core/NEON/kernels/NEHOGDescriptorKernel.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/HOGInfo.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"

#include <algorithm>
#include <arm_neon.h>
#include <cstring>

using namespace arm_compute;

namespace
{
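// Compute the orientation-binning histogram of a single cell narrower than
// 8 pixels: each pixel casts a magnitude-weighted vote that is split linearly
// between the two orientation bins closest to its scaled phase.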
void cell_width_lt8(const int16_t *__restrict mag_row_ptr, const uint8_t *__restrict phase_row_ptr, float *__restrict output_ptr,
                    size_t mag_stride, size_t phase_stride, size_t cell_width, size_t cell_height, size_t num_bins, float phase_scale)
{
    const float32x4_t        scale_f32     = vdupq_n_f32(phase_scale);
    static const float32x4_t one_f32       = vdupq_n_f32(1.0f);
    static const float32x4_t zerofive_f32  = vdupq_n_f32(0.5f);
    static const int32x4_t   zero_s32      = vdupq_n_s32(0);
    static const int32x4_t   one_s32       = vdupq_n_s32(1);
    const int32x4_t          num_bins_s32  = vdupq_n_s32(num_bins);

    memset(output_ptr, 0, sizeof(float) * num_bins);

    for(size_t yc = 0; yc < cell_height; ++yc)
    {
        int32_t xc = 0;

        for(; xc <= static_cast<int32_t>(cell_width) - 4; xc += 4)
        {
            // Load magnitude and phase values
            const uint8x8_t phase_u8 = vld1_u8(phase_row_ptr + xc + yc * phase_stride);
            const int16x4_t mag_s16  = vld1_s16(mag_row_ptr + xc + yc * mag_stride);

            // Convert magnitude and phase to float
            const float32x4_t mag_f32   = vcvtq_f32_s32(vmovl_s16(mag_s16));
            float32x4_t       phase_f32 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(phase_u8))));

            // Scale phase: phase * scale + 0.5f
            phase_f32 = vmlaq_f32(zerofive_f32, phase_f32, scale_f32);
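
            // With the +0.5f offset, truncating the scaled phase (vcvtq_s32_f32
            // rounds towards zero, which equals floor() for these non-negative
            // values) yields the index of the first bin, and the fractional
            // remainder becomes the weight w1 of the neighbouring bin.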

            // Compute histogram index
            int32x4_t hidx_s32 = vcvtq_s32_f32(phase_f32);

            // Compute magnitude weights (w0 and w1)
            const float32x4_t hidx_f32 = vcvtq_f32_s32(hidx_s32);

            // w1 = phase_f32 - hidx_f32
            const float32x4_t w1_f32 = vsubq_f32(phase_f32, hidx_f32);

            // w0 = 1.0 - w1
            const float32x4_t w0_f32 = vsubq_f32(one_f32, w1_f32);

            // Compute the contributions of the vote split between the two bins
            const float32x4_t mag_w0_f32 = vmulq_f32(mag_f32, w0_f32);
            const float32x4_t mag_w1_f32 = vmulq_f32(mag_f32, w1_f32);

            // Weighted vote between 2 bins

            // Check if the histogram index is equal to num_bins. If so, replace the index with 0
            uint32x4_t mask = vceqq_s32(hidx_s32, num_bins_s32);
            hidx_s32        = vbslq_s32(mask, zero_s32, hidx_s32);

            // Bin 0
            *(output_ptr + vgetq_lane_s32(hidx_s32, 0)) += vgetq_lane_f32(mag_w0_f32, 0);
            *(output_ptr + vgetq_lane_s32(hidx_s32, 1)) += vgetq_lane_f32(mag_w0_f32, 1);
            *(output_ptr + vgetq_lane_s32(hidx_s32, 2)) += vgetq_lane_f32(mag_w0_f32, 2);
            *(output_ptr + vgetq_lane_s32(hidx_s32, 3)) += vgetq_lane_f32(mag_w0_f32, 3);

            hidx_s32 = vaddq_s32(hidx_s32, one_s32);

            // Check if the histogram index is equal to num_bins
            mask     = vceqq_s32(hidx_s32, num_bins_s32);
            hidx_s32 = vbslq_s32(mask, zero_s32, hidx_s32);

            // Bin 1
            *(output_ptr + vgetq_lane_s32(hidx_s32, 0)) += vgetq_lane_f32(mag_w1_f32, 0);
            *(output_ptr + vgetq_lane_s32(hidx_s32, 1)) += vgetq_lane_f32(mag_w1_f32, 1);
            *(output_ptr + vgetq_lane_s32(hidx_s32, 2)) += vgetq_lane_f32(mag_w1_f32, 2);
            *(output_ptr + vgetq_lane_s32(hidx_s32, 3)) += vgetq_lane_f32(mag_w1_f32, 3);
        }

        for(; xc < static_cast<int32_t>(cell_width); ++xc)
        {
            const float phase_value = *(phase_row_ptr + xc + yc * phase_stride) * phase_scale + 0.5f;
            const float mag_value   = *(mag_row_ptr + xc + yc * mag_stride);

            const float w1 = phase_value - std::floor(phase_value);

            // The quantised phase is the histogram index in [0, num_bins - 1]
            // Wrap the index: if hidx == num_bins, hidx becomes 0
            const auto hidx = static_cast<size_t>(phase_value) % num_bins;

            // Weighted vote between 2 bins
            *(output_ptr + hidx) += mag_value * (1.0f - w1);
            *(output_ptr + ((hidx + 1) % (num_bins))) += mag_value * w1;
        }
    }
}

void cell_width_ge8(const int16_t *__restrict mag_row_ptr, const uint8_t *__restrict phase_row_ptr, float *__restrict output_ptr, size_t mag_stride, size_t phase_stride, size_t cell_width,
                    size_t cell_height, size_t num_bins, float phase_scale)
{
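    // Same algorithm as cell_width_lt8, but processes eight pixels per
    // iteration by widening each magnitude/phase load into a pair of
    // four-lane float vectors.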
    const float32x4_t        scale_f32     = vdupq_n_f32(phase_scale);
    static const float32x4_t one_f32       = vdupq_n_f32(1.0f);
    static const float32x4_t zerofive_f32  = vdupq_n_f32(0.5f);
    static const int32x4_t   zero_s32      = vdupq_n_s32(0);
    static const int32x4_t   one_s32       = vdupq_n_s32(1);
    const int32x4_t          num_bins_s32  = vdupq_n_s32(num_bins);

    memset(output_ptr, 0, sizeof(float) * num_bins);

    for(size_t yc = 0; yc < cell_height; ++yc)
    {
        int32_t xc = 0;

        for(; xc <= static_cast<int32_t>(cell_width) - 8; xc += 8)
        {
            // Load magnitude and phase values
            const uint8x8_t phase_u8 = vld1_u8(phase_row_ptr + xc + yc * phase_stride);
            const int16x8_t mag_s16  = vld1q_s16(mag_row_ptr + xc + yc * mag_stride);

            // Convert phase to U16
            const uint16x8_t phase_u16 = vmovl_u8(phase_u8);

            // Convert magnitude to float32
            const float32x4x2_t mag_f32 =
            {
                {
                    vcvtq_f32_s32(vmovl_s16(vget_low_s16(mag_s16))),
                    vcvtq_f32_s32(vmovl_s16(vget_high_s16(mag_s16)))
                }
            };

            // Convert phase to float32
            float32x4x2_t phase_f32 =
            {
                {
                    vcvtq_f32_u32(vmovl_u16(vget_low_u16(phase_u16))),
                    vcvtq_f32_u32(vmovl_u16(vget_high_u16(phase_u16)))
                }
            };

            // Scale phase: phase * scale + 0.5f
            phase_f32.val[0] = vmlaq_f32(zerofive_f32, phase_f32.val[0], scale_f32);
            phase_f32.val[1] = vmlaq_f32(zerofive_f32, phase_f32.val[1], scale_f32);

            // Compute histogram index
            int32x4x2_t hidx_s32 =
            {
                {
                    vcvtq_s32_f32(phase_f32.val[0]),
                    vcvtq_s32_f32(phase_f32.val[1])
                }
            };

            // Compute magnitude weights (w0 and w1)
            const float32x4x2_t hidx_f32 =
            {
                {
                    vcvtq_f32_s32(hidx_s32.val[0]),
                    vcvtq_f32_s32(hidx_s32.val[1])
                }
            };

            float32x4x2_t w1_f32 =
            {
                {
                    vsubq_f32(phase_f32.val[0], hidx_f32.val[0]),
                    vsubq_f32(phase_f32.val[1], hidx_f32.val[1])
                }
            };

            float32x4x2_t w0_f32 =
            {
                {
                    vsubq_f32(one_f32, w1_f32.val[0]),
                    vsubq_f32(one_f32, w1_f32.val[1])
                }
            };

            // Compute the contributions of the vote split between the two bins
            const float32x4x2_t mag_w0_f32 =
            {
                {
                    vmulq_f32(mag_f32.val[0], w0_f32.val[0]),
                    vmulq_f32(mag_f32.val[1], w0_f32.val[1])
                }
            };

            const float32x4x2_t mag_w1_f32 =
            {
                {
                    vmulq_f32(mag_f32.val[0], w1_f32.val[0]),
                    vmulq_f32(mag_f32.val[1], w1_f32.val[1])
                }
            };

            // Weighted vote between 2 bins

            // Check if the histogram index is equal to num_bins
            uint32x4x2_t mask =
            {
                {
                    vceqq_s32(hidx_s32.val[0], num_bins_s32),
                    vceqq_s32(hidx_s32.val[1], num_bins_s32)
                }
            };

            hidx_s32.val[0] = vbslq_s32(mask.val[0], zero_s32, hidx_s32.val[0]);
            hidx_s32.val[1] = vbslq_s32(mask.val[1], zero_s32, hidx_s32.val[1]);

            // First bin - low
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 0)) += vgetq_lane_f32(mag_w0_f32.val[0], 0);
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 1)) += vgetq_lane_f32(mag_w0_f32.val[0], 1);
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 2)) += vgetq_lane_f32(mag_w0_f32.val[0], 2);
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 3)) += vgetq_lane_f32(mag_w0_f32.val[0], 3);

            // First bin - high
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 0)) += vgetq_lane_f32(mag_w0_f32.val[1], 0);
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 1)) += vgetq_lane_f32(mag_w0_f32.val[1], 1);
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 2)) += vgetq_lane_f32(mag_w0_f32.val[1], 2);
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 3)) += vgetq_lane_f32(mag_w0_f32.val[1], 3);

            hidx_s32.val[0] = vaddq_s32(hidx_s32.val[0], one_s32);
            hidx_s32.val[1] = vaddq_s32(hidx_s32.val[1], one_s32);

            // Check if the histogram index is equal to num_bins
            mask.val[0] = vceqq_s32(hidx_s32.val[0], num_bins_s32);
            mask.val[1] = vceqq_s32(hidx_s32.val[1], num_bins_s32);

            hidx_s32.val[0] = vbslq_s32(mask.val[0], zero_s32, hidx_s32.val[0]);
            hidx_s32.val[1] = vbslq_s32(mask.val[1], zero_s32, hidx_s32.val[1]);

            // Second bin - low
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 0)) += vgetq_lane_f32(mag_w1_f32.val[0], 0);
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 1)) += vgetq_lane_f32(mag_w1_f32.val[0], 1);
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 2)) += vgetq_lane_f32(mag_w1_f32.val[0], 2);
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 3)) += vgetq_lane_f32(mag_w1_f32.val[0], 3);

            // Second bin - high
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 0)) += vgetq_lane_f32(mag_w1_f32.val[1], 0);
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 1)) += vgetq_lane_f32(mag_w1_f32.val[1], 1);
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 2)) += vgetq_lane_f32(mag_w1_f32.val[1], 2);
            *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 3)) += vgetq_lane_f32(mag_w1_f32.val[1], 3);
        }

        for(; xc < static_cast<int32_t>(cell_width); ++xc)
        {
            const float phase_value = *(phase_row_ptr + xc + yc * phase_stride) * phase_scale + 0.5f;
            const float mag_value   = *(mag_row_ptr + xc + yc * mag_stride);

            const float w1 = phase_value - std::floor(phase_value);

            // The quantised phase is the histogram index in [0, num_bins - 1]
            // Wrap the index: if hidx == num_bins, hidx becomes 0
            const size_t hidx = static_cast<size_t>(phase_value) % num_bins;

            // Weighted vote between 2 bins
            *(output_ptr + hidx) += mag_value * (1.0f - w1);
            *(output_ptr + ((hidx + 1) % (num_bins))) += mag_value * w1;
        }
    }
}

void l2_norm(const float *__restrict input_row_ptr, float *__restrict output_ptr, size_t input_stride,
             size_t num_cells_per_block_height, size_t num_bins_block_x, size_t num_bins_block, float l2_hyst_threshold)
{
    ARM_COMPUTE_UNUSED(l2_hyst_threshold);

    float       sum     = 0.0f;
    float32x4_t sum_f32 = vdupq_n_f32(0.0f);
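
    // Two passes: the first copies the block histogram to the output while
    // accumulating the sum of squares; the second rescales the output by
    // 1 / (sqrt(sum) + num_bins_block * 0.1f), where the additive term guards
    // against division by zero on near-empty blocks.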

    // Compute L2-Norm
    for(size_t yc = 0; yc < num_cells_per_block_height; ++yc)
    {
        const float *const hist_ptr = input_row_ptr + yc * input_stride;

        int32_t xc = 0;

        for(; xc <= static_cast<int32_t>(num_bins_block_x) - 16; xc += 16)
        {
            const float32x4x4_t input_value =
            {
                {
                    vld1q_f32(hist_ptr + xc + 0),
                    vld1q_f32(hist_ptr + xc + 4),
                    vld1q_f32(hist_ptr + xc + 8),
                    vld1q_f32(hist_ptr + xc + 12)
                }
            };

            // Compute input_value^2
            sum_f32 = vmlaq_f32(sum_f32, input_value.val[0], input_value.val[0]);
            sum_f32 = vmlaq_f32(sum_f32, input_value.val[1], input_value.val[1]);
            sum_f32 = vmlaq_f32(sum_f32, input_value.val[2], input_value.val[2]);
            sum_f32 = vmlaq_f32(sum_f32, input_value.val[3], input_value.val[3]);

            vst1q_f32(&output_ptr[xc + 0 + yc * num_bins_block_x], input_value.val[0]);
            vst1q_f32(&output_ptr[xc + 4 + yc * num_bins_block_x], input_value.val[1]);
            vst1q_f32(&output_ptr[xc + 8 + yc * num_bins_block_x], input_value.val[2]);
            vst1q_f32(&output_ptr[xc + 12 + yc * num_bins_block_x], input_value.val[3]);
        }

        // Compute left over
        for(; xc < static_cast<int32_t>(num_bins_block_x); ++xc)
        {
            const float input_value = hist_ptr[xc];

            sum += input_value * input_value;

            output_ptr[xc + yc * num_bins_block_x] = input_value;
        }
    }

    sum += vgetq_lane_f32(sum_f32, 0);
    sum += vgetq_lane_f32(sum_f32, 1);
    sum += vgetq_lane_f32(sum_f32, 2);
    sum += vgetq_lane_f32(sum_f32, 3);

    const float       scale     = 1.0f / (std::sqrt(sum) + num_bins_block * 0.1f);
    const float32x4_t scale_f32 = vdupq_n_f32(scale);

    int32_t i = 0;

    for(; i <= static_cast<int32_t>(num_bins_block) - 16; i += 16)
    {
        float32x4x4_t input_value =
        {
            {
                vld1q_f32(&output_ptr[i + 0]),
                vld1q_f32(&output_ptr[i + 4]),
                vld1q_f32(&output_ptr[i + 8]),
                vld1q_f32(&output_ptr[i + 12])
            }
        };

        // Scale input_value
        input_value.val[0] = vmulq_f32(input_value.val[0], scale_f32);
        input_value.val[1] = vmulq_f32(input_value.val[1], scale_f32);
        input_value.val[2] = vmulq_f32(input_value.val[2], scale_f32);
        input_value.val[3] = vmulq_f32(input_value.val[3], scale_f32);

        vst1q_f32(&output_ptr[i + 0], input_value.val[0]);
        vst1q_f32(&output_ptr[i + 4], input_value.val[1]);
        vst1q_f32(&output_ptr[i + 8], input_value.val[2]);
        vst1q_f32(&output_ptr[i + 12], input_value.val[3]);
    }

    for(; i < static_cast<int32_t>(num_bins_block); ++i)
    {
        output_ptr[i] *= scale;
    }
}

void l2hys_norm(const float *__restrict input_row_ptr, float *__restrict output_ptr, size_t input_stride, size_t num_cells_per_block_height, size_t num_bins_block_x, size_t num_bins_block,
                float l2_hyst_threshold)
{
    float       sum     = 0.0f;
    float32x4_t sum_f32 = vdupq_n_f32(0.0f);
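
    // L2-Hys normalisation (Dalal and Triggs): L2-normalise the block
    // histogram, clip each component at l2_hyst_threshold, then renormalise.
    // The loops below fuse the clipping with the first rescale.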

    // Compute L2-Hys
    for(size_t yc = 0; yc < num_cells_per_block_height; ++yc)
    {
        const float *const hist_ptr = input_row_ptr + yc * input_stride;

        int32_t xc = 0;

        for(; xc <= static_cast<int32_t>(num_bins_block_x) - 16; xc += 16)
        {
            const float32x4x4_t input_value =
            {
                {
                    vld1q_f32(hist_ptr + xc + 0),
                    vld1q_f32(hist_ptr + xc + 4),
                    vld1q_f32(hist_ptr + xc + 8),
                    vld1q_f32(hist_ptr + xc + 12)
                }
            };

            // Compute input_value^2
            sum_f32 = vmlaq_f32(sum_f32, input_value.val[0], input_value.val[0]);
            sum_f32 = vmlaq_f32(sum_f32, input_value.val[1], input_value.val[1]);
            sum_f32 = vmlaq_f32(sum_f32, input_value.val[2], input_value.val[2]);
            sum_f32 = vmlaq_f32(sum_f32, input_value.val[3], input_value.val[3]);

            vst1q_f32(&output_ptr[xc + 0 + yc * num_bins_block_x], input_value.val[0]);
            vst1q_f32(&output_ptr[xc + 4 + yc * num_bins_block_x], input_value.val[1]);
            vst1q_f32(&output_ptr[xc + 8 + yc * num_bins_block_x], input_value.val[2]);
            vst1q_f32(&output_ptr[xc + 12 + yc * num_bins_block_x], input_value.val[3]);
        }

        // Compute left over
        for(; xc < static_cast<int32_t>(num_bins_block_x); ++xc)
        {
            const float input_value = hist_ptr[xc];

            sum += input_value * input_value;

            output_ptr[xc + yc * num_bins_block_x] = input_value;
        }
    }

    sum += vgetq_lane_f32(sum_f32, 0);
    sum += vgetq_lane_f32(sum_f32, 1);
    sum += vgetq_lane_f32(sum_f32, 2);
    sum += vgetq_lane_f32(sum_f32, 3);

    float             scale                   = 1.0f / (std::sqrt(sum) + num_bins_block * 0.1f);
    float32x4_t       scale_f32               = vdupq_n_f32(scale);
    const float32x4_t l2_hyst_threshold_f32   = vdupq_n_f32(l2_hyst_threshold);

    // Reset sum
    sum_f32 = vdupq_n_f32(0.0f);
    sum     = 0.0f;

    int32_t i = 0;

    for(; i <= static_cast<int32_t>(num_bins_block) - 16; i += 16)
    {
        float32x4x4_t input_value =
        {
            {
                vld1q_f32(&output_ptr[i + 0]),
                vld1q_f32(&output_ptr[i + 4]),
                vld1q_f32(&output_ptr[i + 8]),
                vld1q_f32(&output_ptr[i + 12])
            }
        };

        // Scale input_value
        input_value.val[0] = vmulq_f32(input_value.val[0], scale_f32);
        input_value.val[1] = vmulq_f32(input_value.val[1], scale_f32);
        input_value.val[2] = vmulq_f32(input_value.val[2], scale_f32);
        input_value.val[3] = vmulq_f32(input_value.val[3], scale_f32);

        // Clip input_value if over l2_hyst_threshold
        input_value.val[0] = vminq_f32(input_value.val[0], l2_hyst_threshold_f32);
        input_value.val[1] = vminq_f32(input_value.val[1], l2_hyst_threshold_f32);
        input_value.val[2] = vminq_f32(input_value.val[2], l2_hyst_threshold_f32);
        input_value.val[3] = vminq_f32(input_value.val[3], l2_hyst_threshold_f32);

        // Compute input_value^2
        sum_f32 = vmlaq_f32(sum_f32, input_value.val[0], input_value.val[0]);
        sum_f32 = vmlaq_f32(sum_f32, input_value.val[1], input_value.val[1]);
        sum_f32 = vmlaq_f32(sum_f32, input_value.val[2], input_value.val[2]);
        sum_f32 = vmlaq_f32(sum_f32, input_value.val[3], input_value.val[3]);

        vst1q_f32(&output_ptr[i + 0], input_value.val[0]);
        vst1q_f32(&output_ptr[i + 4], input_value.val[1]);
        vst1q_f32(&output_ptr[i + 8], input_value.val[2]);
        vst1q_f32(&output_ptr[i + 12], input_value.val[3]);
    }

    sum += vgetq_lane_f32(sum_f32, 0);
    sum += vgetq_lane_f32(sum_f32, 1);
    sum += vgetq_lane_f32(sum_f32, 2);
    sum += vgetq_lane_f32(sum_f32, 3);

    for(; i < static_cast<int32_t>(num_bins_block); ++i)
    {
        float input_value = output_ptr[i] * scale;

        // Clip scaled input_value if over l2_hyst_threshold
        input_value = std::min(input_value, l2_hyst_threshold);

        sum += input_value * input_value;

        output_ptr[i] = input_value;
    }

    // Use the same constant as OpenCV
    scale     = 1.0f / (std::sqrt(sum) + 1e-3f);
    scale_f32 = vdupq_n_f32(scale);

    // Rescale
    i = 0;

    for(; i <= static_cast<int32_t>(num_bins_block) - 16; i += 16)
    {
        float32x4x4_t input_value =
        {
            {
                vld1q_f32(&output_ptr[i + 0]),
                vld1q_f32(&output_ptr[i + 4]),
                vld1q_f32(&output_ptr[i + 8]),
                vld1q_f32(&output_ptr[i + 12])
            }
        };

        // Scale input_value
        input_value.val[0] = vmulq_f32(input_value.val[0], scale_f32);
        input_value.val[1] = vmulq_f32(input_value.val[1], scale_f32);
        input_value.val[2] = vmulq_f32(input_value.val[2], scale_f32);
        input_value.val[3] = vmulq_f32(input_value.val[3], scale_f32);

        vst1q_f32(&output_ptr[i + 0], input_value.val[0]);
        vst1q_f32(&output_ptr[i + 4], input_value.val[1]);
        vst1q_f32(&output_ptr[i + 8], input_value.val[2]);
        vst1q_f32(&output_ptr[i + 12], input_value.val[3]);
    }

    for(; i < static_cast<int32_t>(num_bins_block); ++i)
    {
        // Store result
        output_ptr[i] *= scale;
    }
}

void l1_norm(const float *__restrict input_row_ptr, float *__restrict output_ptr, size_t input_stride, size_t num_cells_per_block_height, size_t num_bins_block_x, size_t num_bins_block,
             float l2_hyst_threshold)
{
    ARM_COMPUTE_UNUSED(l2_hyst_threshold);

    float       sum     = 0.0f;
    float32x4_t sum_f32 = vdupq_n_f32(0.0f);
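
    // As in l2_norm, the first pass copies the block histogram to the output
    // while accumulating the norm (here the sum of absolute values); the second
    // pass rescales the output by 1 / (sqrt(sum) + num_bins_block * 0.1f).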

    // Compute L1-Norm
    for(size_t yc = 0; yc < num_cells_per_block_height; ++yc)
    {
        const float *const hist_ptr = input_row_ptr + yc * input_stride;

        int32_t xc = 0;

        for(; xc <= static_cast<int32_t>(num_bins_block_x) - 16; xc += 16)
        {
            const float32x4x4_t input_value =
            {
                {
                    vld1q_f32(hist_ptr + xc + 0),
                    vld1q_f32(hist_ptr + xc + 4),
                    vld1q_f32(hist_ptr + xc + 8),
                    vld1q_f32(hist_ptr + xc + 12)
                }
            };

            // Compute |input_value|
            sum_f32 += vabsq_f32(input_value.val[0]);
            sum_f32 += vabsq_f32(input_value.val[1]);
            sum_f32 += vabsq_f32(input_value.val[2]);
            sum_f32 += vabsq_f32(input_value.val[3]);

            vst1q_f32(&output_ptr[xc + 0 + yc * num_bins_block_x], input_value.val[0]);
            vst1q_f32(&output_ptr[xc + 4 + yc * num_bins_block_x], input_value.val[1]);
            vst1q_f32(&output_ptr[xc + 8 + yc * num_bins_block_x], input_value.val[2]);
            vst1q_f32(&output_ptr[xc + 12 + yc * num_bins_block_x], input_value.val[3]);
        }

        // Compute left over
        for(; xc < static_cast<int32_t>(num_bins_block_x); ++xc)
        {
            const float input_value = hist_ptr[xc];

            sum += std::abs(input_value);

            output_ptr[xc + yc * num_bins_block_x] = input_value;
        }
    }

    sum += vgetq_lane_f32(sum_f32, 0);
    sum += vgetq_lane_f32(sum_f32, 1);
    sum += vgetq_lane_f32(sum_f32, 2);
    sum += vgetq_lane_f32(sum_f32, 3);

    const float       scale     = 1.0f / (std::sqrt(sum) + num_bins_block * 0.1f);
    const float32x4_t scale_f32 = vdupq_n_f32(scale);

    int32_t i = 0;

    for(; i <= static_cast<int32_t>(num_bins_block) - 16; i += 16)
    {
        float32x4x4_t input_value =
        {
            {
                vld1q_f32(&output_ptr[i + 0]),
                vld1q_f32(&output_ptr[i + 4]),
                vld1q_f32(&output_ptr[i + 8]),
                vld1q_f32(&output_ptr[i + 12])
            }
        };

        // Scale input_value
        input_value.val[0] = vmulq_f32(input_value.val[0], scale_f32);
        input_value.val[1] = vmulq_f32(input_value.val[1], scale_f32);
        input_value.val[2] = vmulq_f32(input_value.val[2], scale_f32);
        input_value.val[3] = vmulq_f32(input_value.val[3], scale_f32);

        vst1q_f32(&output_ptr[i + 0], input_value.val[0]);
        vst1q_f32(&output_ptr[i + 4], input_value.val[1]);
        vst1q_f32(&output_ptr[i + 8], input_value.val[2]);
        vst1q_f32(&output_ptr[i + 12], input_value.val[3]);
    }

    for(; i < static_cast<int32_t>(num_bins_block); ++i)
    {
        output_ptr[i] *= scale;
    }
}
} // namespace

NEHOGOrientationBinningKernel::NEHOGOrientationBinningKernel()
    : _func(nullptr), _input_magnitude(nullptr), _input_phase(nullptr), _output(nullptr), _cell_width(0), _cell_height(0), _num_bins(0), _phase_scale(0)
{
}

void NEHOGOrientationBinningKernel::configure(const ITensor *input_magnitude, const ITensor *input_phase, ITensor *output, const HOGInfo *hog_info)
{
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_magnitude, 1, DataType::S16);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_phase, 1, DataType::U8);
    ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins(), DataType::F32);
    ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimX) != input_phase->info()->dimension(Window::DimX));
    ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimY) != input_phase->info()->dimension(Window::DimY));

    _input_magnitude = input_magnitude;
    _input_phase     = input_phase;
    _output          = output;
    _cell_width      = hog_info->cell_size().width;
    _cell_height     = hog_info->cell_size().height;
    _num_bins        = hog_info->num_bins();
    _phase_scale     = (PhaseType::SIGNED == hog_info->phase_type() ? _num_bins / 360.0f : _num_bins / 180.0f);
    _phase_scale *= (PhaseType::SIGNED == hog_info->phase_type() ? 360.0f / 255.0f : 1.0f);
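    // The two factors above assume the U8 phase convention of the NEON
    // magnitude/phase kernels: for SIGNED phase the full [0, 360) degree range
    // is compressed into [0, 255], hence the extra 360 / 255 factor; for
    // UNSIGNED the angle is assumed to be stored directly in degrees [0, 180].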
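    // Cells narrower than 8 pixels cannot fill the 8-lane inner loop, so the
    // 4-lane variant is selected for them; both variants handle leftover
    // columns with scalar code.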
    if(_cell_width < 8)
    {
        _func = &cell_width_lt8;
    }
    else
    {
        _func = &cell_width_ge8;
    }

    constexpr unsigned int num_elems_processed_per_iteration = 1;
    const unsigned int     num_elems_read_per_iteration      = 1;
    const unsigned int     num_rows_read_per_iteration       = _cell_height;
    const unsigned int     num_elems_written_per_iteration   = 1;

    // Configure kernel window
    Window                 win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);

    update_window_and_padding(win,
                              AccessWindowRectangle(input_magnitude->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
                              AccessWindowRectangle(input_phase->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
                              output_access);

    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));

    INEKernel::configure(win);
}

void NEHOGOrientationBinningKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
    ARM_COMPUTE_ERROR_ON(_func == nullptr);

    const size_t mag_stride   = _input_magnitude->info()->strides_in_bytes()[Window::DimY] / pixel_size_from_format(_input_magnitude->info()->format());
    const size_t phase_stride = _input_phase->info()->strides_in_bytes()[Window::DimY] / pixel_size_from_format(_input_phase->info()->format());

    Window win_mag(window);
    win_mag.set(Window::DimX, Window::Dimension(window.x().start() * _cell_width, window.x().start() * _cell_width, _cell_width));
    win_mag.set(Window::DimY, Window::Dimension(window.y().start() * _cell_height, window.y().start() * _cell_height, _cell_height));
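    // Each output element corresponds to one cell, so the input windows are
    // anchored at the cell's top-left corner and advance in steps of one cell;
    // the selected cell function walks the cell interior using the row strides
    // computed above.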

    Window win_phase(win_mag);

    Iterator mag(_input_magnitude, win_mag);
    Iterator phase(_input_phase, win_phase);
    Iterator out(_output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        const auto mag_row_ptr   = reinterpret_cast<const int16_t *>(mag.ptr());
        const auto phase_row_ptr = reinterpret_cast<const uint8_t *>(phase.ptr());
        const auto out_row_ptr   = reinterpret_cast<float *>(out.ptr());

        (*_func)(mag_row_ptr, phase_row_ptr, out_row_ptr, mag_stride, phase_stride, _cell_width, _cell_height, _num_bins, _phase_scale);
    },
    mag, phase, out);
}

NEHOGBlockNormalizationKernel::NEHOGBlockNormalizationKernel()
    : _func(nullptr), _input(nullptr), _output(nullptr), _num_cells_per_block(), _num_cells_per_block_stride(), _num_bins(0), _l2_hyst_threshold(0.0f)
{
}

void NEHOGBlockNormalizationKernel::configure(const ITensor *input, ITensor *output, const HOGInfo *hog_info)
{
    ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, hog_info->num_bins(), DataType::F32);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);

    // Number of cells per block
    const Size2D num_cells_per_block(hog_info->block_size().width / hog_info->cell_size().width,
                                     hog_info->block_size().height / hog_info->cell_size().height);

    // Number of cells per block stride
    const Size2D num_cells_per_block_stride(hog_info->block_stride().width / hog_info->cell_size().width,
                                            hog_info->block_stride().height / hog_info->cell_size().height);

    _input                      = input;
    _output                     = output;
    _l2_hyst_threshold          = hog_info->l2_hyst_threshold();
    _num_cells_per_block        = num_cells_per_block;
    _num_cells_per_block_stride = num_cells_per_block_stride;
    _num_bins                   = hog_info->num_bins();

    ARM_COMPUTE_ERROR_ON((output->info()->num_channels() != (_num_bins * num_cells_per_block.width * num_cells_per_block.height)));

    switch(hog_info->normalization_type())
    {
        case HOGNormType::L2_NORM:
            _func = &l2_norm;
            break;
        case HOGNormType::L2HYS_NORM:
            _func = &l2hys_norm;
            break;
        case HOGNormType::L1_NORM:
            _func = &l1_norm;
            break;
        default:
            ARM_COMPUTE_ERROR("Normalisation type not supported");
            break;
    }

    constexpr unsigned int num_elems_processed_per_iteration = 1;
    const unsigned int     num_elems_read_per_iteration      = 1;
    const unsigned int     num_rows_read_per_iteration       = _num_cells_per_block.height;
    const unsigned int     num_elems_written_per_iteration   = 1;
    const unsigned int     num_rows_written_per_iteration    = _num_cells_per_block.height;

    // Configure kernel window
    Window                win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
    AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_written_per_iteration);

    update_window_and_padding(win,
                              AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
                              output_access);

    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));

    INEKernel::configure(win);
}

void NEHOGBlockNormalizationKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON(_func == nullptr);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);

    // Get number of bins per block
    const size_t num_bins_per_block = _output->info()->num_channels();

    // Number of bins on the same row of the block
    const int32_t num_bins_per_block_x = _num_cells_per_block.width * _num_bins;

    const size_t input_stride = _input->info()->strides_in_bytes()[Window::DimY] / data_size_from_type(_input->info()->data_type());
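    // input_stride is the distance, in floats, between consecutive rows of cell
    // histograms, while num_bins_per_block_x spans one row of cells of a block.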

    Window win_in(window);
    win_in.set_dimension_step(Window::DimX, _num_cells_per_block_stride.width);
    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
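    // The Y dimension of the input window is collapsed: the per-block row
    // offset is applied manually inside the loop, as consecutive block rows
    // are num_cells_per_block_stride.height cell rows apart in the input.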

    Iterator in(_input, win_in);
    Iterator out(_output, window);

    // Normalises blocks
    execute_window_loop(window, [&](const Coordinates & id)
    {
        const auto input_row_ptr = reinterpret_cast<const float *>(in.ptr() + id.y() * _num_cells_per_block_stride.height * _input->info()->strides_in_bytes()[Window::DimY]);
        const auto out_row_ptr   = reinterpret_cast<float *>(out.ptr());

        // Execute normalization function
        (*_func)(input_row_ptr, out_row_ptr, input_stride, _num_cells_per_block.height, num_bins_per_block_x, num_bins_per_block, _l2_hyst_threshold);
    },
    in, out);
}