blob: bac27430f972ea510c60bc1e27f01fe8d39ead35 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +01002 * Copyright (c) 2016-2020 Arm Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Michalis Spyrouebcebf12020-10-21 00:04:14 +010024#include "src/core/NEON/kernels/NEConvolutionKernel.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
26#include "arm_compute/core/Coordinates.h"
27#include "arm_compute/core/Error.h"
28#include "arm_compute/core/Helpers.h"
29#include "arm_compute/core/ITensor.h"
30#include "arm_compute/core/TensorInfo.h"
31#include "arm_compute/core/Types.h"
32#include "arm_compute/core/Utils.h"
33#include "arm_compute/core/Validate.h"
34#include "arm_compute/core/Window.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010035#include "src/core/helpers/AutoConfiguration.h"
36#include "src/core/helpers/WindowHelpers.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010037
38#include <algorithm>
39#include <arm_neon.h>
40#include <array>
41#include <cstdint>
42#include <cstring>
43#include <tuple>
44
45namespace arm_compute
46{
47namespace
48{
// Upper clamp used when narrowing unsigned 16-bit accumulators into an S16 output.
const uint16x8_t max_int16 = vdupq_n_u16(INT16_MAX);
50
/** Saturate two S32 accumulators down to S16 and store 8 results contiguously. */
inline void store_results(const int32x4_t &out, const int32x4_t &out2, int16_t *output)
{
    const int16x4_t lo = vqmovn_s32(out);
    const int16x4_t hi = vqmovn_s32(out2);
    vst1q_s16(output, vcombine_s16(lo, hi));
}
57
/** Saturate two S32 accumulators to U8 (via an unsigned U16 narrow) and store 8 bytes. */
inline void store_results(const int32x4_t &out, const int32x4_t &out2, uint8_t *output)
{
    const uint16x4_t lo = vqmovun_s32(out);
    const uint16x4_t hi = vqmovun_s32(out2);
    vst1_u8(output, vqmovn_u16(vcombine_u16(lo, hi)));
}
64
/** Narrow two U32 accumulators to U16 with saturation, clamp to INT16_MAX,
 *  then reinterpret as S16 so the stored values are never negative. */
inline void store_results(const uint32x4_t &out, const uint32x4_t &out2, int16_t *output)
{
    const uint16x8_t narrowed = vcombine_u16(vqmovn_u32(out), vqmovn_u32(out2));
    const uint16x8_t clamped  = vminq_u16(narrowed, max_int16);
    vst1q_s16(output, vreinterpretq_s16_u16(clamped));
}
71
/** Two saturating narrowing steps, U32 -> U16 -> U8, then one 8-byte store. */
inline void store_results(const uint32x4_t &out, const uint32x4_t &out2, uint8_t *output)
{
    const uint16x4_t lo = vqmovn_u32(out);
    const uint16x4_t hi = vqmovn_u32(out2);
    vst1_u8(output, vqmovn_u16(vcombine_u16(lo, hi)));
}
78
/** Store 16 S16 results (two 8-lane registers) back to back; no conversion needed. */
inline void store_results(const int16x8_t &out, const int16x8_t &out2, int16_t *output)
{
    vst1q_s16(output, out);
    vst1q_s16(output + 8, out2);
}
84
/** Saturate 16 signed 16-bit values into the U8 range and store them in one shot. */
inline void store_results(const int16x8_t &out, const int16x8_t &out2, uint8_t *output)
{
    const uint8x8_t lo = vqmovun_s16(out);
    const uint8x8_t hi = vqmovun_s16(out2);
    vst1q_u8(output, vcombine_u8(lo, hi));
}
91
/** Saturating narrow of both U16 halves to U8 followed by a single 16-byte store. */
inline void store_results(const uint16x8_t &out, const uint16x8_t &out2, uint8_t *output)
{
    const uint8x8_t lo = vqmovn_u16(out);
    const uint8x8_t hi = vqmovn_u16(out2);
    vst1q_u8(output, vcombine_u8(lo, hi));
}
98
/** Clamp unsigned 16-bit values to INT16_MAX so reinterpreting as S16 cannot
 *  produce negatives, then store all 16 results. */
inline void store_results(const uint16x8_t &out, const uint16x8_t &out2, int16_t *output)
{
    const int16x8_t lo = vreinterpretq_s16_u16(vminq_u16(out, max_int16));
    const int16x8_t hi = vreinterpretq_s16_u16(vminq_u16(out2, max_int16));
    vst1q_s16(output, lo);
    vst1q_s16(output + 8, hi);
}
104
/** Accumulate one image row against three pre-broadcast coefficients.
 *  out / out2 collect the S32 accumulators for output pixels [0,3] / [4,7]. */
inline void convolve_row3x1_unrolled(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16x4_t &mat0, const int16x4_t &mat1, const int16x4_t &mat2)
{
    // Widen the 16 input bytes to signed 16-bit; only the first 12 values are
    // used, handled as three blocks of 4 (samples 0-3, 4-7 and 8-11).
    const int16x8_t lower_half = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
    const int16x8_t upper_half = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));

    const int16x4_t blk0 = vget_low_s16(lower_half);
    const int16x4_t blk1 = vget_high_s16(lower_half);
    const int16x4_t blk2 = vget_low_s16(upper_half);

    // Pixels [0,3]: left, middle and right taps.
    out = vmlal_s16(out, blk0, mat0);
    out = vmlal_s16(out, vext_s16(blk0, blk1, 1), mat1);
    out = vmlal_s16(out, vext_s16(blk0, blk1, 2), mat2);

    // Pixels [4,7]: the same taps shifted one block to the right.
    out2 = vmlal_s16(out2, blk1, mat0);
    out2 = vmlal_s16(out2, vext_s16(blk1, blk2, 1), mat1);
    out2 = vmlal_s16(out2, vext_s16(blk1, blk2, 2), mat2);
}
134
/** 3-tap row convolution: broadcast the coefficients, then reuse the unrolled kernel. */
inline void convolve_row3x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
{
    const int16x4_t k0 = vld1_dup_s16(&convolution[0]);
    const int16x4_t k1 = vld1_dup_s16(&convolution[1]);
    const int16x4_t k2 = vld1_dup_s16(&convolution[2]);

    convolve_row3x1_unrolled(out, out2, row_data, k0, k1, k2);
}
143
/** Accumulate one image row against a 5-tap row of the convolution matrix.
 *  out / out2 collect the S32 accumulators for output pixels [0,3] / [4,7]. */
inline void convolve_row5x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
{
    // Broadcast the five row coefficients into one register each.
    const int16x4_t k0 = vld1_dup_s16(&convolution[0]);
    const int16x4_t k1 = vld1_dup_s16(&convolution[1]);
    const int16x4_t k2 = vld1_dup_s16(&convolution[2]);
    const int16x4_t k3 = vld1_dup_s16(&convolution[3]);
    const int16x4_t k4 = vld1_dup_s16(&convolution[4]);

    // Widen the input bytes to signed 16-bit, as three blocks of 4 values.
    const int16x8_t lower_half = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
    const int16x8_t upper_half = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));

    const int16x4_t blk0 = vget_low_s16(lower_half);
    const int16x4_t blk1 = vget_high_s16(lower_half);
    const int16x4_t blk2 = vget_low_s16(upper_half);

    // Pixels [0,3]: taps at offsets 0..4 from the leftmost sample.
    out = vmlal_s16(out, blk0, k0);
    out = vmlal_s16(out, vext_s16(blk0, blk1, 1), k1);
    out = vmlal_s16(out, vext_s16(blk0, blk1, 2), k2);
    out = vmlal_s16(out, vext_s16(blk0, blk1, 3), k3);
    out = vmlal_s16(out, blk1, k4);

    // Pixels [4,7]: identical taps shifted one block to the right.
    out2 = vmlal_s16(out2, blk1, k0);
    out2 = vmlal_s16(out2, vext_s16(blk1, blk2, 1), k1);
    out2 = vmlal_s16(out2, vext_s16(blk1, blk2, 2), k2);
    out2 = vmlal_s16(out2, vext_s16(blk1, blk2, 3), k3);
    out2 = vmlal_s16(out2, blk2, k4);
}
187
/** Accumulate one image row against a 7-tap row of the convolution matrix.
 *  out / out2 collect the S32 accumulators for output pixels [0,3] / [4,7]. */
inline void convolve_row7x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
{
    // Broadcast the seven row coefficients into one register each.
    const int16x4_t k0 = vld1_dup_s16(&convolution[0]);
    const int16x4_t k1 = vld1_dup_s16(&convolution[1]);
    const int16x4_t k2 = vld1_dup_s16(&convolution[2]);
    const int16x4_t k3 = vld1_dup_s16(&convolution[3]);
    const int16x4_t k4 = vld1_dup_s16(&convolution[4]);
    const int16x4_t k5 = vld1_dup_s16(&convolution[5]);
    const int16x4_t k6 = vld1_dup_s16(&convolution[6]);

    // Widen the 16 input bytes to signed 16-bit, as four blocks of 4 values.
    const int16x8_t lower_half = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
    const int16x8_t upper_half = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));

    const int16x4_t blk0 = vget_low_s16(lower_half);
    const int16x4_t blk1 = vget_high_s16(lower_half);
    const int16x4_t blk2 = vget_low_s16(upper_half);
    const int16x4_t blk3 = vget_high_s16(upper_half);

    // Pixels [0,3]: taps at offsets 0..6 from the leftmost sample.
    out = vmlal_s16(out, blk0, k0);
    out = vmlal_s16(out, vext_s16(blk0, blk1, 1), k1);
    out = vmlal_s16(out, vext_s16(blk0, blk1, 2), k2);
    out = vmlal_s16(out, vext_s16(blk0, blk1, 3), k3);
    out = vmlal_s16(out, blk1, k4);
    out = vmlal_s16(out, vext_s16(blk1, blk2, 1), k5);
    out = vmlal_s16(out, vext_s16(blk1, blk2, 2), k6);

    // Pixels [4,7]: identical taps shifted one block to the right.
    out2 = vmlal_s16(out2, blk1, k0);
    out2 = vmlal_s16(out2, vext_s16(blk1, blk2, 1), k1);
    out2 = vmlal_s16(out2, vext_s16(blk1, blk2, 2), k2);
    out2 = vmlal_s16(out2, vext_s16(blk1, blk2, 3), k3);
    out2 = vmlal_s16(out2, blk2, k4);
    out2 = vmlal_s16(out2, vext_s16(blk2, blk3, 1), k5);
    out2 = vmlal_s16(out2, vext_s16(blk2, blk3, 2), k6);
}
242
/** Accumulate one image row against a 9-tap row of the convolution matrix.
 *  out / out2 collect the S32 accumulators for output pixels [0,3] / [4,7].
 *  NOTE(review): the original comments labelled the out2 section "[0,3]";
 *  it computes pixels [4,7] (one block to the right), matching the 7x1 variant. */
inline void convolve_row9x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
{
    const int16x4_t mat0 = vld1_dup_s16(convolution);
    const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
    const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
    const int16x4_t mat3 = vld1_dup_s16(convolution + 3);
    const int16x4_t mat4 = vld1_dup_s16(convolution + 4);
    const int16x4_t mat5 = vld1_dup_s16(convolution + 5);
    const int16x4_t mat6 = vld1_dup_s16(convolution + 6);
    const int16x4_t mat7 = vld1_dup_s16(convolution + 7);
    const int16x4_t mat8 = vld1_dup_s16(convolution + 8);

    // Convert to s16 and split in blocks of 4 values:
    const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
    const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));

    const int16x4x4_t row =
    {
        {
            vget_low_s16(s16_tmp0),
            vget_high_s16(s16_tmp0),
            vget_low_s16(s16_tmp1),
            vget_high_s16(s16_tmp1)
        }
    };

    // Calculate row left 4 value for pixels [0,3]
    out = vmlal_s16(out, row.val[0], mat0);
    // Calculate row left 3 value for pixels [0,3]
    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
    // Calculate row left 2 value for pixels [0,3]
    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
    // Calculate row left 1 value for pixels [0,3]
    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3);
    // Calculate row middle value for pixels [0,3]
    out = vmlal_s16(out, row.val[1], mat4);
    // Calculate row right +1 value for pixels [0,3]
    out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 1), mat5);
    // Calculate row right +2 value for pixels [0,3]
    out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 2), mat6);
    // Calculate row right +3 value for pixels [0,3]
    out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 3), mat7);
    // Calculate row right +4 value for pixels [0,3]
    out = vmlal_s16(out, row.val[2], mat8);

    // Calculate row left 4 value for pixels [4,7]
    out2 = vmlal_s16(out2, row.val[1], mat0);
    // Calculate row left 3 value for pixels [4,7]
    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
    // Calculate row left 2 value for pixels [4,7]
    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
    // Calculate row left 1 value for pixels [4,7]
    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3);
    // Calculate row middle value for pixels [4,7]
    out2 = vmlal_s16(out2, row.val[2], mat4);
    // Calculate row right +1 value for pixels [4,7]
    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 1), mat5);
    // Calculate row right +2 value for pixels [4,7]
    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 2), mat6);
    // Calculate row right +3 value for pixels [4,7]
    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 3), mat7);
    // Calculate row right +4 value for pixels [4,7]
    out2 = vmlal_s16(out2, row.val[3], mat8);
}
307} // namespace
308
309/****************************************************************************************\
310 * Square Convolution *
311\****************************************************************************************/
312
// Default-construct with a zero scale and an all-zero coefficient matrix;
// the real values are filled in by configure().
template <unsigned int matrix_size>
NEConvolutionKernel<matrix_size>::NEConvolutionKernel()
    : INESimpleKernel(), _scale(0), _convolution{ {} }
{
}
318
319template <unsigned int matrix_size>
320BorderSize NEConvolutionKernel<matrix_size>::border_size() const
321{
Michalis Spyroua4f378d2019-04-26 14:54:54 +0100322 return BorderSize{ matrix_size / 2 };
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100323}
324
325template <unsigned int matrix_size>
326void NEConvolutionKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, bool border_undefined)
327{
328 ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv);
329
330 set_shape_if_empty(*output->info(), input->info()->tensor_shape());
331
332 ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
333 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
334 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
335
336 _input = input;
337 _output = output;
338
339 std::copy_n(conv, _convolution.size(), _convolution.begin());
340
341 if(scale == 0)
342 {
343 _scale = calculate_matrix_scale(_convolution.data(), matrix_size);
344 }
345 else
346 {
347 _scale = scale;
348 }
349
350 // Configure kernel window
351 constexpr unsigned int num_elems_processed_per_iteration = 8;
352 constexpr unsigned int num_elems_read_per_iteration = 16;
353 constexpr unsigned int num_elems_written_per_iteration = 8;
354
355 Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
356 AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
357
358 update_window_and_padding(win,
359 AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, matrix_size),
360 output_access);
361
362 output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
363
364 INEKernel::configure(win);
365}
366
/** 3x3 convolution over the given window; each iteration produces 8 output pixels.
 *  OutputType selects U8 or S16 storage (enforced by the static_assert).
 *  Fixes a copy-paste comment: the third load reads the low row, not the middle one. */
template <>
template <typename OutputType>
void NEConvolutionKernel<3>::convolution(const Window &win)
{
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);

    Iterator input(_input, win);
    Iterator output(_output, win);

    // Load the matrix's coefficients into NEON registers:
    const int16x4_t   mat00     = vld1_dup_s16(_convolution.data());
    const int16x4_t   mat01     = vld1_dup_s16(_convolution.data() + 1);
    const int16x4_t   mat02     = vld1_dup_s16(_convolution.data() + 2);
    const int16x4_t   mat10     = vld1_dup_s16(_convolution.data() + 3);
    const int16x4_t   mat11     = vld1_dup_s16(_convolution.data() + 4);
    const int16x4_t   mat12     = vld1_dup_s16(_convolution.data() + 5);
    const int16x4_t   mat20     = vld1_dup_s16(_convolution.data() + 6);
    const int16x4_t   mat21     = vld1_dup_s16(_convolution.data() + 7);
    const int16x4_t   mat22     = vld1_dup_s16(_convolution.data() + 8);
    // Reciprocal of the scale, applied as a float multiply on the accumulators.
    const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);

    // Row base pointers, shifted left/up by the kernel radius (1).
    const unsigned char *input_top_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, -1));
    const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, 0));
    const unsigned char *input_low_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, 1));

    execute_window_loop(win, [&](const Coordinates &)
    {
        int32x4_t out  = vdupq_n_s32(0);
        int32x4_t out2 = vdupq_n_s32(0);

        // Load 16 bytes from the top row:
        const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
        convolve_row3x1_unrolled(out, out2, top_data, mat00, mat01, mat02);

        // Load 16 bytes from the middle row:
        const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
        convolve_row3x1_unrolled(out, out2, mid_data, mat10, mat11, mat12);

        // Load 16 bytes from the low row:
        const uint8x16_t low_data = vld1q_u8(input_low_ptr + input.offset());
        convolve_row3x1_unrolled(out, out2, low_data, mat20, mat21, mat22);

        // Apply scale
        if(_scale != 1)
        {
            // Convert to F32, scale and convert back to S32
            out  = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
            out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
        }

        // Clamp and store as U8 or S16:
        store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
    },
    input, output);
}
423
/** 5x5 convolution over the given window; each iteration produces 8 output pixels.
 *  OutputType selects U8 or S16 storage (enforced by the static_assert). */
template <>
template <typename OutputType>
void NEConvolutionKernel<5>::convolution(const Window &win)
{
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);

    Iterator input(_input, win);
    Iterator output(_output, win);

    // Reciprocal of the scale, applied as a float multiply on the accumulators.
    const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);

    // Row base pointers, shifted left/up by the kernel radius (2).
    const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, -2));
    const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, -1));
    const unsigned char *input_mid_ptr  = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 0));
    const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 1));
    const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 2));

    execute_window_loop(win, [&](const Coordinates &)
    {
        int32x4_t out  = vdupq_n_s32(0);
        int32x4_t out2 = vdupq_n_s32(0);

        // Load 16 bytes from the top2 row:
        const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset());
        convolve_row5x1(out, out2, data_t2, _convolution.data());

        // Load 16 bytes from the top1 row:
        const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset());
        convolve_row5x1(out, out2, data_t1, _convolution.data() + 5);

        // Load 16 bytes from the middle row:
        const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset());
        convolve_row5x1(out, out2, data_m, _convolution.data() + 10);

        // Load 16 bytes from the low1 row:
        const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset());
        convolve_row5x1(out, out2, data_b1, _convolution.data() + 15);

        // Load 16 bytes from the low2 row:
        const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset());
        convolve_row5x1(out, out2, data_b2, _convolution.data() + 20);

        // Apply scale
        if(_scale != 1)
        {
            // Convert to F32, scale and convert back to S32
            out  = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
            out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
        }

        // Clamp and store as U8 or S16:
        store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
    },
    input, output);
}
480
/** 7x7 convolution over the given window; each iteration produces 8 output pixels.
 *  OutputType selects U8 or S16 storage (enforced by the static_assert). */
template <>
template <typename OutputType>
void NEConvolutionKernel<7>::convolution(const Window &win)
{
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);

    Iterator input(_input, win);
    Iterator output(_output, win);

    // Reciprocal of the scale, applied as a float multiply on the accumulators.
    const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);

    // Row base pointers, shifted left/up by the kernel radius (3).
    const unsigned char *input_top3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -3));
    const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -2));
    const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -1));
    const unsigned char *input_mid_ptr  = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 0));
    const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 1));
    const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 2));
    const unsigned char *input_low3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 3));

    execute_window_loop(win, [&](const Coordinates &)
    {
        int32x4_t out  = vdupq_n_s32(0);
        int32x4_t out2 = vdupq_n_s32(0);

        // Load 16 bytes from the top3 row:
        const uint8x16_t data_t3 = vld1q_u8(input_top3_ptr + input.offset());
        convolve_row7x1(out, out2, data_t3, _convolution.data());

        // Load 16 bytes from the top2 row:
        const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset());
        convolve_row7x1(out, out2, data_t2, _convolution.data() + 7);

        // Load 16 bytes from the top1 row:
        const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset());
        convolve_row7x1(out, out2, data_t1, _convolution.data() + 14);

        // Load 16 bytes from the middle row:
        const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset());
        convolve_row7x1(out, out2, data_m, _convolution.data() + 21);

        // Load 16 bytes from the low1 row:
        const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset());
        convolve_row7x1(out, out2, data_b1, _convolution.data() + 28);

        // Load 16 bytes from the low2 row:
        const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset());
        convolve_row7x1(out, out2, data_b2, _convolution.data() + 35);

        // Load 16 bytes from the low3 row:
        const uint8x16_t data_b3 = vld1q_u8(input_low3_ptr + input.offset());
        convolve_row7x1(out, out2, data_b3, _convolution.data() + 42);

        // Apply scale
        if(_scale != 1)
        {
            // Convert to F32, scale and convert back to S32
            out  = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
            out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
        }

        // Clamp and store as U8 or S16:
        store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
    },
    input, output);
}
547
/** 9x9 convolution over the given window; each iteration produces 8 output pixels.
 *  OutputType selects U8 or S16 storage (enforced by the static_assert). */
template <>
template <typename OutputType>
void NEConvolutionKernel<9>::convolution(const Window &win)
{
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);

    Iterator input(_input, win);
    Iterator output(_output, win);

    // Reciprocal of the scale, applied as a float multiply on the accumulators.
    const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);

    // Row base pointers, shifted left/up by the kernel radius (4).
    const unsigned char *input_top4_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -4));
    const unsigned char *input_top3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -3));
    const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -2));
    const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -1));
    const unsigned char *input_mid_ptr  = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 0));
    const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 1));
    const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 2));
    const unsigned char *input_low3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 3));
    const unsigned char *input_low4_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 4));

    execute_window_loop(win, [&](const Coordinates &)
    {
        int32x4_t out  = vdupq_n_s32(0);
        int32x4_t out2 = vdupq_n_s32(0);

        // Load 16 bytes from the top4 row:
        const uint8x16_t data_t4 = vld1q_u8(input_top4_ptr + input.offset());
        convolve_row9x1(out, out2, data_t4, _convolution.data());

        // Load 16 bytes from the top3 row:
        const uint8x16_t data_t3 = vld1q_u8(input_top3_ptr + input.offset());
        convolve_row9x1(out, out2, data_t3, _convolution.data() + 9);

        // Load 16 bytes from the top2 row:
        const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset());
        convolve_row9x1(out, out2, data_t2, _convolution.data() + 18);

        // Load 16 bytes from the top1 row:
        const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset());
        convolve_row9x1(out, out2, data_t1, _convolution.data() + 27);

        // Load 16 bytes from the middle row:
        const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset());
        convolve_row9x1(out, out2, data_m, _convolution.data() + 36);

        // Load 16 bytes from the low1 row:
        const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset());
        convolve_row9x1(out, out2, data_b1, _convolution.data() + 45);

        // Load 16 bytes from the low2 row:
        const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset());
        convolve_row9x1(out, out2, data_b2, _convolution.data() + 54);

        // Load 16 bytes from the low3 row:
        const uint8x16_t data_b3 = vld1q_u8(input_low3_ptr + input.offset());
        convolve_row9x1(out, out2, data_b3, _convolution.data() + 63);

        // Load 16 bytes from the low4 row:
        const uint8x16_t data_b4 = vld1q_u8(input_low4_ptr + input.offset());
        convolve_row9x1(out, out2, data_b4, _convolution.data() + 72);

        // Apply scale
        if(_scale != 1)
        {
            // Convert to F32, scale and convert back to S32
            out  = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
            out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
        }

        // Clamp and store as U8 or S16:
        store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
    },
    input, output);
}
624
/** Execute the kernel on a sub-window, dispatching on the configured output type
 *  (U8 or S16, validated in configure()). */
template <unsigned int matrix_size>
void NEConvolutionKernel<matrix_size>::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    switch(_output->info()->data_type())
    {
        case DataType::U8:
            convolution<uint8_t>(window);
            break;
        case DataType::S16:
            convolution<int16_t>(window);
            break;
        default:
            ARM_COMPUTE_ERROR("Not supported Data type!");
            break;
    }
}
645
646template class arm_compute::NEConvolutionKernel<3>;
647template class arm_compute::NEConvolutionKernel<5>;
648template class arm_compute::NEConvolutionKernel<7>;
649template class arm_compute::NEConvolutionKernel<9>;
650
651/****************************************************************************************\
652 * Separable Square Convolution *
653\****************************************************************************************/
654
// Default-construct with zeroed row coefficients and no border;
// configure() fills in both.
template <unsigned int matrix_size>
NESeparableConvolutionHorKernel<matrix_size>::NESeparableConvolutionHorKernel()
    : _conv_row{ { 0 } }, _border_size(0)
{
}
660
template <unsigned int matrix_size>
BorderSize NESeparableConvolutionHorKernel<matrix_size>::border_size() const
{
    // Computed in configure(); depends on matrix_size and border_undefined.
    return _border_size;
}
666
/** Validate the tensors, store the row coefficients and set up the execution
 *  window for the horizontal pass of the separable convolution. */
template <unsigned int matrix_size>
void NESeparableConvolutionHorKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv_row, bool border_undefined)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv_row);

    set_shape_if_empty(*output->info(), input->info()->tensor_shape());

    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::S16, DataType::S32);

    _input  = input;
    _output = output;
    // Keep a private copy of the matrix_size row coefficients.
    std::copy_n(conv_row, _conv_row.size(), _conv_row.begin());
    // Horizontal pass: left/right border is always needed; top/bottom only when defined.
    _border_size = BorderSize(border_undefined ? 0 : matrix_size / 2, matrix_size / 2);

    // Configure kernel window
    constexpr unsigned int num_elems_processed_per_iteration = 8;
    constexpr unsigned int num_elems_read_per_iteration      = 16;
    constexpr unsigned int num_elems_written_per_iteration   = 8;

    Window                 win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);

    update_window_and_padding(win,
                              AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration),
                              output_access);

    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());

    INEKernel::configure(win);
}
699
/** Execute the horizontal pass, dispatching on the configured intermediate
 *  data type (U16, S16 or S32, validated in configure()). */
template <unsigned int matrix_size>
void NESeparableConvolutionHorKernel<matrix_size>::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
    switch(_output->info()->data_type())
    {
        case DataType::U16:
            convolve<uint16_t>(window);
            break;
        case DataType::S16:
            convolve<int16_t>(window);
            break;
        case DataType::S32:
            convolve<int32_t>(window);
            break;
        default:
            ARM_COMPUTE_ERROR("Unsupported intermediate data type!");
            break;
    }
}
722
template <>
template <>
inline void NESeparableConvolutionHorKernel<5>::convolve<uint16_t>(const Window &window)
{
    // 5-tap horizontal pass with U16 accumulation.
    // Shift the read window 2 elements left so the first load covers the
    // leftmost tap of the 5-wide neighbourhood around each output pixel.
    Window win_in(window);
    win_in.shift(Window::DimX, -2);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        // Load 16 U8 pixels and widen to two U16 vectors: lanes [0..7] and [8..15].
        const uint8x16_t data = vld1q_u8(input.ptr());

        const uint16x8x2_t data_u16 =
        {
            {
                vmovl_u8(vget_low_u8(data)),
                vmovl_u8(vget_high_u8(data))
            }
        };

        // Multiply-accumulate the 5 taps; vextq_u16(lo, hi, i) selects the
        // 8-lane window starting at pixel offset i (input shifted by one tap per step).
        uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);

        // Store 8 U16 intermediate results.
        vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
    },
    input, output);
}
755
template <>
template <>
inline void NESeparableConvolutionHorKernel<5>::convolve<int16_t>(const Window &window)
{
    // 5-tap horizontal pass with S16 accumulation (same scheme as the U16
    // variant, but the widened pixels are reinterpreted as signed so the
    // S16 coefficients can produce negative partial sums).
    Window win_in(window);
    win_in.shift(Window::DimX, -2);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        // Load 16 U8 pixels, widen to U16 and reinterpret as S16.
        const uint8x16_t data = vld1q_u8(input.ptr());

        const int16x8x2_t data_s16 =
        {
            {
                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
            }
        };

        // Accumulate the 5 taps; vextq_s16(lo, hi, i) is the 8-lane window at offset i.
        int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);

        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
    },
    input, output);
}
788
template <>
template <>
void NESeparableConvolutionHorKernel<5>::convolve<int32_t>(const Window &window)
{
    // 5-tap horizontal pass with S32 accumulation: widening multiplies
    // (S16 x S16 -> S32) avoid the overflow possible in the 16-bit variants.
    Window win_in(window);
    win_in.shift(Window::DimX, -2);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        // Load 16 U8 pixels, widen to U16 and reinterpret as S16.
        const uint8x16_t data = vld1q_u8(input.ptr());

        const int16x8x2_t data_s16 =
        {
            {
                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
            }
        };

        // Pre-compute the shifted 8-lane windows for taps 1..4
        // (l/m/r = left of / at / right of the centre pixel).
        const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
        const int16x8_t data_s16_m = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
        const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
        const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 4);

        // Accumulate lanes 0..3 with widening multiply-accumulates.
        int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[1]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[2]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[3]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[4]);

        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);

        // Accumulate lanes 4..7.
        int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[1]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[2]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[3]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[4]);

        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
    },
    input, output);
}
834
template <>
template <>
inline void NESeparableConvolutionHorKernel<7>::convolve<uint16_t>(const Window &window)
{
    // 7-tap horizontal pass with U16 accumulation.
    // Shift the read window 3 elements left to reach the leftmost tap.
    Window win_in(window);
    win_in.shift(Window::DimX, -3);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        // Load 16 U8 pixels and widen to two U16 vectors.
        const uint8x16_t data = vld1q_u8(input.ptr());

        const uint16x8x2_t data_u16 =
        {
            {
                vmovl_u8(vget_low_u8(data)),
                vmovl_u8(vget_high_u8(data))
            }
        };

        // Accumulate the 7 taps; vextq_u16(lo, hi, i) is the 8-lane window at offset i.
        uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 5), _conv_row[5]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 6), _conv_row[6]);

        vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
    },
    input, output);
}
869
template <>
template <>
inline void NESeparableConvolutionHorKernel<7>::convolve<int16_t>(const Window &window)
{
    // 7-tap horizontal pass with S16 accumulation.
    Window win_in(window);
    win_in.shift(Window::DimX, -3);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        // Load 16 U8 pixels, widen to U16 and reinterpret as S16.
        const uint8x16_t data = vld1q_u8(input.ptr());

        const int16x8x2_t data_s16 =
        {
            {
                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
            }
        };

        // Accumulate the 7 taps; each vextq_s16 offset selects the next tap's pixels.
        int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 5), _conv_row[5]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 6), _conv_row[6]);

        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
    },
    input, output);
}
904
template <>
template <>
void NESeparableConvolutionHorKernel<7>::convolve<int32_t>(const Window &window)
{
    // 7-tap horizontal pass with S32 accumulation via widening multiplies.
    Window win_in(window);
    win_in.shift(Window::DimX, -3);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        // Load 16 U8 pixels, widen to U16 and reinterpret as S16.
        const uint8x16_t data = vld1q_u8(input.ptr());

        const int16x8x2_t data_s16 =
        {
            {
                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
            }
        };

        // Shifted 8-lane windows for taps 1..6 (l = left of centre, m = centre, r = right).
        const int16x8_t data_s16_l2 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
        const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
        const int16x8_t data_s16_m = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
        const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 4);
        const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 5);
        const int16x8_t data_s16_r3 = vextq_s16(data_s16.val[0], data_s16.val[1], 6);

        // Accumulate lanes 0..3 into S32.
        int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l2), _conv_row[1]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[2]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[3]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[4]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[5]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r3), _conv_row[6]);

        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);

        // Accumulate lanes 4..7.
        int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l2), _conv_row[1]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[2]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[3]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[4]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[5]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r3), _conv_row[6]);

        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
    },
    input, output);
}
956
template <>
template <>
inline void NESeparableConvolutionHorKernel<9>::convolve<uint16_t>(const Window &window)
{
    // 9-tap horizontal pass with U16 accumulation.
    // Shift the read window 4 elements left to reach the leftmost tap; the
    // 16-pixel load then covers all 9 taps for 8 outputs (offsets 0..8).
    Window win_in(window);
    win_in.shift(Window::DimX, -4);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        // Load 16 U8 pixels and widen to two U16 vectors.
        const uint8x16_t data = vld1q_u8(input.ptr());

        const uint16x8x2_t data_u16 =
        {
            {
                vmovl_u8(vget_low_u8(data)),
                vmovl_u8(vget_high_u8(data))
            }
        };

        // Accumulate the 9 taps; tap 8 is exactly the upper half, so no vextq is needed.
        uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 5), _conv_row[5]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 6), _conv_row[6]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 7), _conv_row[7]);
        out = vmlaq_n_u16(out, data_u16.val[1], _conv_row[8]);

        vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
    },
    input, output);
}
993
template <>
template <>
inline void NESeparableConvolutionHorKernel<9>::convolve<int16_t>(const Window &window)
{
    // 9-tap horizontal pass with S16 accumulation.
    Window win_in(window);
    win_in.shift(Window::DimX, -4);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        // Load 16 U8 pixels, widen to U16 and reinterpret as S16.
        const uint8x16_t data = vld1q_u8(input.ptr());

        const int16x8x2_t data_s16 =
        {
            {
                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
            }
        };

        // Accumulate the 9 taps; tap 8 is the upper half itself (offset 8).
        int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 5), _conv_row[5]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 6), _conv_row[6]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 7), _conv_row[7]);
        out = vmlaq_n_s16(out, data_s16.val[1], _conv_row[8]);

        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
    },
    input, output);
}
1030
template <>
template <>
void NESeparableConvolutionHorKernel<9>::convolve<int32_t>(const Window &window)
{
    // 9-tap horizontal pass with S32 accumulation via widening multiplies.
    Window win_in(window);
    win_in.shift(Window::DimX, -4);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        // Load 16 U8 pixels, widen to U16 and reinterpret as S16.
        const uint8x16_t data = vld1q_u8(input.ptr());

        const int16x8x2_t data_s16 =
        {
            {
                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
            }
        };

        // Shifted 8-lane windows for taps 1..7; tap 8 is data_s16.val[1] itself.
        const int16x8_t data_s16_l3 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
        const int16x8_t data_s16_l2 = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
        const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
        const int16x8_t data_s16_m = vextq_s16(data_s16.val[0], data_s16.val[1], 4);
        const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 5);
        const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 6);
        const int16x8_t data_s16_r3 = vextq_s16(data_s16.val[0], data_s16.val[1], 7);

        // Accumulate lanes 0..3 into S32.
        int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l3), _conv_row[1]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l2), _conv_row[2]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[3]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[4]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[5]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[6]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r3), _conv_row[7]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16.val[1]), _conv_row[8]);

        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);

        // Accumulate lanes 4..7.
        int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l3), _conv_row[1]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l2), _conv_row[2]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[3]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[4]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[5]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[6]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r3), _conv_row[7]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16.val[1]), _conv_row[8]);

        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
    },
    input, output);
}
1087
// Explicit instantiations of the horizontal kernel for the supported matrix sizes.
template class arm_compute::NESeparableConvolutionHorKernel<5>;
template class arm_compute::NESeparableConvolutionHorKernel<7>;
template class arm_compute::NESeparableConvolutionHorKernel<9>;
1091
// Default constructor: zero-initialise the column coefficients and the scale;
// the real values are provided later by configure().
template <unsigned int matrix_size>
NESeparableConvolutionVertKernel<matrix_size>::NESeparableConvolutionVertKernel()
    : _conv_col{ { 0 } }, _scale(0)
{
}
1097
template <unsigned int matrix_size>
BorderSize NESeparableConvolutionVertKernel<matrix_size>::border_size() const
{
    // The vertical pass reads matrix_size rows per output row, so it needs
    // matrix_size / 2 extra rows of border; no horizontal border is required.
    return BorderSize{ matrix_size / 2, 0 };
}
1103
1104template <unsigned int matrix_size>
1105void NESeparableConvolutionVertKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv_col, uint32_t scale, bool border_undefined)
1106{
1107 ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv_col);
1108
1109 set_shape_if_empty(*output->info(), input->info()->tensor_shape());
1110
1111 ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
1112 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::S16, DataType::S32);
1113 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
1114 ARM_COMPUTE_ERROR_ON(scale == 0);
1115
1116 _input = input;
1117 _output = output;
1118 std::copy_n(conv_col, _conv_col.size(), _conv_col.begin());
1119 _scale = scale;
1120
1121 // Configure kernel window
1122 constexpr unsigned int num_elems_processed_per_iteration = 16;
1123 constexpr unsigned int num_elems_read_per_iteration = 16;
1124 constexpr unsigned int num_elems_written_per_iteration = 16;
1125
1126 Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
1127 AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
1128
1129 update_window_and_padding(win,
1130 AccessWindowRectangle(input->info(), 0, -border_size().top, num_elems_read_per_iteration, matrix_size),
1131 output_access);
1132
1133 output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
1134
1135 INEKernel::configure(win);
1136}
1137
1138template <unsigned int matrix_size>
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001139void NESeparableConvolutionVertKernel<matrix_size>::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001140{
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001141 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001142 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
1143 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
1144
1145 switch(_input->info()->data_type())
1146 {
1147 case DataType::U16:
1148 switch(_output->info()->data_type())
1149 {
1150 case DataType::U8:
1151 convolution_u16<uint8_t>(window);
1152 break;
1153 case DataType::S16:
1154 convolution_u16<int16_t>(window);
1155 break;
1156 default:
1157 ARM_COMPUTE_ERROR("Not supported");
1158 }
1159 break;
1160 case DataType::S16:
1161 switch(_output->info()->data_type())
1162 {
1163 case DataType::U8:
1164 convolution_s16<uint8_t>(window);
1165 break;
1166 case DataType::S16:
1167 convolution_s16<int16_t>(window);
1168 break;
1169 default:
1170 ARM_COMPUTE_ERROR("Not supported");
1171 }
1172 break;
1173 case DataType::S32:
1174 switch(_output->info()->data_type())
1175 {
1176 case DataType::U8:
1177 convolution_s32<uint8_t>(window);
1178 break;
1179 case DataType::S16:
1180 convolution_s32<int16_t>(window);
1181 break;
1182 default:
1183 ARM_COMPUTE_ERROR("Not supported");
1184 }
1185 break;
1186 default:
1187 ARM_COMPUTE_ERROR("Unsupported intermediate data type!");
1188 break;
1189 }
1190}
1191
template <unsigned int matrix_size>
template <typename OutputType>
void NESeparableConvolutionVertKernel<matrix_size>::convolution_u16(const Window &win)
{
    // Vertical pass over a U16 intermediate: accumulate matrix_size rows per
    // output element, then optionally divide by _scale before narrowing to
    // OutputType via store_results().
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");

    // The output window steps 16 elements; the input iterator steps 8 so each
    // loop iteration handles the 16 outputs as two 8-lane halves.
    Window win_in(win);
    win_in.set_dimension_step(Window::DimX, 8);

    Iterator in(_input, win_in);
    Iterator out(_output, win);

    // Base pointer of each contributing row; in.offset() is added per iteration.
    std::array<unsigned char *, matrix_size> input_ptrs{ {} };
    const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
    const int k_half = matrix_size / 2;

    // Set row pointers: rows -k_half..k_half relative to the current output row.
    for(int i = -k_half; i <= k_half; ++i)
    {
        input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
    }

    execute_window_loop(win, [&](const Coordinates &)
    {
        uint16x8_t out0 = vdupq_n_u16(0);
        uint16x8_t out1 = vdupq_n_u16(0);

        // First half: accumulate lanes 0..7 over all matrix_size rows.
        for(unsigned int r = 0; r < matrix_size; ++r)
        {
            const uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(input_ptrs[r] + in.offset()));
            out0 = vmlaq_n_u16(out0, data, _conv_col[r]);
        }

        // Advance the input iterator to the second group of 8 elements.
        in.increment(Window::DimX);

        // Second half: lanes 8..15.
        for(unsigned int r = 0; r < matrix_size; ++r)
        {
            const uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(input_ptrs[r] + in.offset()));
            out1 = vmlaq_n_u16(out1, data, _conv_col[r]);
        }

        // Scale the result if needed: widen to F32, multiply by 1/scale and
        // convert back to integer before the narrowing store.
        if(_scale != 1)
        {
            float32x4_t out0_f32_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(out0)));
            float32x4_t out0_f32_low = vcvtq_f32_u32(vmovl_u16(vget_low_u16(out0)));
            out0_f32_high = vmulq_f32(out0_f32_high, oneoverscale);
            out0_f32_low = vmulq_f32(out0_f32_low, oneoverscale);
            store_results(vcvtq_u32_f32(out0_f32_low), vcvtq_u32_f32(out0_f32_high), reinterpret_cast<OutputType *>(out.ptr()));

            float32x4_t out1_f32_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(out1)));
            float32x4_t out1_f32_low = vcvtq_f32_u32(vmovl_u16(vget_low_u16(out1)));
            out1_f32_high = vmulq_f32(out1_f32_high, oneoverscale);
            out1_f32_low = vmulq_f32(out1_f32_low, oneoverscale);
            store_results(vcvtq_u32_f32(out1_f32_low), vcvtq_u32_f32(out1_f32_high), reinterpret_cast<OutputType *>(out.ptr()) + 8);
        }
        else
        {
            store_results(out0, out1, reinterpret_cast<OutputType *>(out.ptr()));
        }
    },
    in, out);
}
1257
template <unsigned int matrix_size>
template <typename OutputType>
void NESeparableConvolutionVertKernel<matrix_size>::convolution_s16(const Window &win)
{
    // Vertical pass over an S16 intermediate; mirrors convolution_u16 with
    // signed arithmetic throughout.
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");

    // Output window steps 16 elements, input iterator steps 8: two 8-lane halves.
    Window win_in(win);
    win_in.set_dimension_step(Window::DimX, 8);

    Iterator in(_input, win_in);
    Iterator out(_output, win);

    // Base pointer of each contributing row; in.offset() is added per iteration.
    std::array<unsigned char *, matrix_size> input_ptrs{ {} };
    const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
    const int k_half = matrix_size / 2;

    // Set row pointers: rows -k_half..k_half relative to the current output row.
    for(int i = -k_half; i <= k_half; ++i)
    {
        input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
    }

    execute_window_loop(win, [&](const Coordinates &)
    {
        int16x8_t out0 = vdupq_n_s16(0);
        int16x8_t out1 = vdupq_n_s16(0);

        // First half: accumulate lanes 0..7 over all matrix_size rows.
        for(unsigned int r = 0; r < matrix_size; ++r)
        {
            const int16x8_t data = vld1q_s16(reinterpret_cast<const int16_t *>(input_ptrs[r] + in.offset()));
            out0 = vmlaq_n_s16(out0, data, _conv_col[r]);
        }

        in.increment(Window::DimX);

        // Second half: lanes 8..15.
        for(unsigned int r = 0; r < matrix_size; ++r)
        {
            const int16x8_t data = vld1q_s16(reinterpret_cast<const int16_t *>(input_ptrs[r] + in.offset()));
            out1 = vmlaq_n_s16(out1, data, _conv_col[r]);
        }

        // Scale the result if needed: widen to F32, multiply by 1/scale and
        // convert back to integer before the narrowing store.
        if(_scale != 1)
        {
            float32x4_t out0_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(out0)));
            float32x4_t out0_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(out0)));
            out0_f32_high = vmulq_f32(out0_f32_high, oneoverscale);
            out0_f32_low = vmulq_f32(out0_f32_low, oneoverscale);
            store_results(vcvtq_s32_f32(out0_f32_low), vcvtq_s32_f32(out0_f32_high), reinterpret_cast<OutputType *>(out.ptr()));

            float32x4_t out1_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(out1)));
            float32x4_t out1_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(out1)));
            out1_f32_high = vmulq_f32(out1_f32_high, oneoverscale);
            out1_f32_low = vmulq_f32(out1_f32_low, oneoverscale);
            store_results(vcvtq_s32_f32(out1_f32_low), vcvtq_s32_f32(out1_f32_high), reinterpret_cast<OutputType *>(out.ptr()) + 8);
        }
        else
        {
            store_results(out0, out1, reinterpret_cast<OutputType *>(out.ptr()));
        }
    },
    in, out);
}
1323
template <unsigned int matrix_size>
template <typename OutputType>
void NESeparableConvolutionVertKernel<matrix_size>::convolution_s32(const Window &win)
{
    // Vertical pass over an S32 intermediate. vld2q_s32 loads 8 consecutive S32
    // values de-interleaved (even indices in val[0], odd in val[1]); vzipq_s32
    // restores the original element order just before storing.
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");

    // Output window steps 16 elements, input iterator steps 8: two 8-element halves.
    Window win_in(win);
    win_in.set_dimension_step(Window::DimX, 8);

    Iterator in(_input, win_in);
    Iterator out(_output, win);

    // Base pointer of each contributing row; in.offset() is added per iteration.
    std::array<unsigned char *, matrix_size> input_ptrs{ {} };
    const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
    const int k_half = matrix_size / 2;

    // Set row pointers: rows -k_half..k_half relative to the current output row.
    for(int i = -k_half; i <= k_half; ++i)
    {
        input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
    }

    const int32x4_t zero = vdupq_n_s32(0);

    execute_window_loop(win, [&](const Coordinates &)
    {
        // Accumulators for the first 8 elements (kept de-interleaved).
        int32x4x2_t out0 =
        {
            {
                zero,
                zero
            }
        };

        // Accumulators for the second 8 elements.
        int32x4x2_t out1 =
        {
            {
                zero,
                zero
            }
        };

        // First half
        for(unsigned int r = 0; r < matrix_size; ++r)
        {
            const int32x4x2_t data = vld2q_s32(reinterpret_cast<const int32_t *>(input_ptrs[r] + in.offset()));
            out0.val[0] = vmlaq_n_s32(out0.val[0], data.val[0], _conv_col[r]);
            out0.val[1] = vmlaq_n_s32(out0.val[1], data.val[1], _conv_col[r]);
        }

        in.increment(Window::DimX);

        // Second half
        for(unsigned int r = 0; r < matrix_size; ++r)
        {
            const int32x4x2_t data = vld2q_s32(reinterpret_cast<const int32_t *>(input_ptrs[r] + in.offset()));
            out1.val[0] = vmlaq_n_s32(out1.val[0], data.val[0], _conv_col[r]);
            out1.val[1] = vmlaq_n_s32(out1.val[1], data.val[1], _conv_col[r]);
        }

        // Scale the result if needed: convert to F32, multiply by 1/scale,
        // convert back to S32 (still de-interleaved).
        if(_scale != 1)
        {
            float32x4_t out0_f32_odd = vcvtq_f32_s32(out0.val[0]);
            float32x4_t out0_f32_even = vcvtq_f32_s32(out0.val[1]);
            out0_f32_odd = vmulq_f32(out0_f32_odd, oneoverscale);
            out0_f32_even = vmulq_f32(out0_f32_even, oneoverscale);
            out0.val[0] = vcvtq_s32_f32(out0_f32_odd);
            out0.val[1] = vcvtq_s32_f32(out0_f32_even);

            float32x4_t out1_f32_odd = vcvtq_f32_s32(out1.val[0]);
            float32x4_t out1_f32_even = vcvtq_f32_s32(out1.val[1]);
            out1_f32_odd = vmulq_f32(out1_f32_odd, oneoverscale);
            out1_f32_even = vmulq_f32(out1_f32_even, oneoverscale);
            out1.val[0] = vcvtq_s32_f32(out1_f32_odd);
            out1.val[1] = vcvtq_s32_f32(out1_f32_even);
        }

        // Re-interleave to the original element order and narrow-store.
        const int32x4x2_t out0_s32 = vzipq_s32(out0.val[0], out0.val[1]);
        store_results(out0_s32.val[0], out0_s32.val[1], reinterpret_cast<OutputType *>(out.ptr()));

        const int32x4x2_t out1_s32 = vzipq_s32(out1.val[0], out1.val[1]);
        store_results(out1_s32.val[0], out1_s32.val[1], reinterpret_cast<OutputType *>(out.ptr()) + 8);
    },
    in, out);
}
1410
// Explicit instantiations of the vertical kernel for the supported matrix sizes.
template class arm_compute::NESeparableConvolutionVertKernel<5>;
template class arm_compute::NESeparableConvolutionVertKernel<7>;
template class arm_compute::NESeparableConvolutionVertKernel<9>;
1414
1415/****************************************************************************************\
1416 * Rectangle Convolution *
1417\****************************************************************************************/
1418
// Default constructor: all state is null/zero until configure() is called.
NEConvolutionRectangleKernel::NEConvolutionRectangleKernel()
    : _input(nullptr), _output(nullptr), _scale(0), _convolution(), _border_size(), _func_idx(0)
{
}
1423
BorderSize NEConvolutionRectangleKernel::border_size() const
{
    // Computed in configure() from the rectangle dimensions (height/2, width/2).
    return _border_size;
}
1428
1429void NEConvolutionRectangleKernel::configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined)
1430{
1431 ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv);
1432
1433 set_shape_if_empty(*output->info(), input->info()->tensor_shape());
1434
1435 ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
1436 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
1437 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
1438 ARM_COMPUTE_ERROR_ON(width != 3 && width != 5 && width != 7 && width != 9);
1439 ARM_COMPUTE_ERROR_ON(height != 3 && height != 5 && height != 7 && height != 9);
1440 ARM_COMPUTE_ERROR_ON(0 == scale);
1441
1442 _input = input;
1443 _output = output;
1444 _scale = scale;
1445 _border_size = BorderSize(height / 2, width / 2);
1446
1447 // Setup the convolution matrix
1448 const uint32_t nr_elements = width * height;
1449 _convolution.resize(nr_elements);
1450 std::copy_n(conv, nr_elements, _convolution.begin());
1451
1452 // Set function index to help choose appropriate function in run()
1453 _func_idx = get_index(height) * 4 + get_index(width);
1454 ARM_COMPUTE_ERROR_ON(_func_idx > (_nr_supported_sizes * _nr_supported_sizes));
1455
1456 // Configure kernel window
1457 constexpr unsigned int num_elems_processed_per_iteration = 8;
1458 constexpr unsigned int num_elems_read_per_iteration = 16;
1459 constexpr unsigned int num_elems_written_per_iteration = 8;
1460
Diego Lopez Recas0021d752017-12-18 14:42:56 +00001461 Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, _border_size);
1462 AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001463
1464 update_window_and_padding(win,
1465 AccessWindowRectangle(input->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, height),
1466 output_access);
1467
1468 output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, _border_size);
1469
1470 INEKernel::configure(win);
1471}
1472
/** Execute the rectangle convolution over the given window.
 *
 * Dispatches to the templated convolution<OutputType, rows, cols> instantiation
 * selected by @p _func_idx and the output tensor's data type (U8 or S16).
 *
 * @param[in] window Region on which to execute the kernel (validated against the configured window).
 * @param[in] info   Thread meta-data (unused here; kernel is data-parallel over the window).
 */
void NEConvolutionRectangleKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    // Pointer-to-member type for the templated convolution implementations.
    using ConvolutionRectangleFunction = void (NEConvolutionRectangleKernel::*)(const Window & window);

    // uint8_t function table
    // Entries are ordered rows-major over the supported dimensions {3,5,7,9}:
    // index = 4 * row_index + col_index, with {3,5,7,9} -> {0,1,2,3} (see get_index).
    // NOTE(review): _func_idx is presumably built with this same formula in configure() — outside this view; confirm there.
    static const std::array<ConvolutionRectangleFunction, 16> func_table_u8 =
    {
        {
            &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 3>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 5>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 7>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 9>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 3>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 5>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 7>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 9>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 3>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 5>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 7>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 9>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 3>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 5>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 7>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 9>
        }
    };
    // int16_t function table
    // Same index layout as func_table_u8, for S16 output.
    static const std::array<ConvolutionRectangleFunction, 16> func_table_s16 =
    {
        {
            &NEConvolutionRectangleKernel::convolution<int16_t, 3, 3>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 3, 5>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 3, 7>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 3, 9>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 5, 3>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 5, 5>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 5, 7>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 5, 9>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 7, 3>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 7, 5>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 7, 7>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 7, 9>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 9, 3>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 9, 5>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 9, 7>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 9, 9>
        }
    };

    // Run appropriate function
    // The output data type selects which table to index with _func_idx.
    switch(_output->info()->data_type())
    {
        case DataType::U8:
            ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_u8.size());
            (this->*func_table_u8[_func_idx])(window);
            break;
        case DataType::S16:
            ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_s16.size());
            (this->*func_table_s16[_func_idx])(window);
            break;
        default:
            ARM_COMPUTE_ERROR("Not supported");
    }
}
1541
1542unsigned int NEConvolutionRectangleKernel::get_index(uint32_t val)
1543{
1544 switch(val)
1545 {
1546 case 3:
1547 return 0;
1548 case 5:
1549 return 1;
1550 case 7:
1551 return 2;
1552 case 9:
1553 return 3;
1554 default:
1555 ARM_COMPUTE_ERROR("Not supported dimension size");
1556 return 0;
1557 }
1558}
1559
1560template <typename OutputType, unsigned int rows, unsigned int cols>
1561void NEConvolutionRectangleKernel::convolution(const Window &win)
1562{
1563 static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
1564 ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
1565
1566 Iterator input(_input, win);
1567 Iterator output(_output, win);
1568
1569 std::array<unsigned char *, rows> input_ptrs{ {} };
1570 const int16_t *conv = _convolution.data();
1571 const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);
1572 const int k_row_half = rows / 2;
1573 const int k_col_half = cols / 2;
1574
1575 // Set row pointers
1576 for(int i = -k_row_half; i <= k_row_half; ++i)
1577 {
1578 input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
1579 }
1580
Michalis Spyroua4f378d2019-04-26 14:54:54 +01001581 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001582 {
1583 int32x4_t out = vdupq_n_s32(0);
1584 int32x4_t out2 = vdupq_n_s32(0);
1585
1586 // Perform appropriate convolution
1587 for(unsigned int r = 0; r < rows; ++r)
1588 {
1589 const uint8x16_t data = vld1q_u8(input_ptrs[r] + input.offset());
1590 if(3 == cols)
1591 {
1592 convolve_row3x1(out, out2, data, conv + r * cols);
1593 }
1594 else if(5 == cols)
1595 {
1596 convolve_row5x1(out, out2, data, conv + r * cols);
1597 }
1598 else if(7 == cols)
1599 {
1600 convolve_row7x1(out, out2, data, conv + r * cols);
1601 }
1602 else if(9 == cols)
1603 {
1604 convolve_row9x1(out, out2, data, conv + r * cols);
1605 }
1606 else
1607 {
1608 ARM_COMPUTE_ERROR("Unsupported number of columns");
1609 }
1610 }
1611
1612 // Apply scale
1613 if(_scale != 1)
1614 {
1615 // Convert to F32, scale and convert back to S32
1616 out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
1617 out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));
1618 }
1619
1620 // Clamp and store as U8 or S16:
1621 store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
1622 },
1623 input, output);
1624}
1625} // namespace arm_compute