/*
 * Copyright (c) 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
25
26#include "arm_compute/core/AccessWindowStatic.h"
27#include "arm_compute/core/Error.h"
28#include "arm_compute/core/Helpers.h"
29#include "arm_compute/core/ITensor.h"
30#include "arm_compute/core/TensorInfo.h"
31#include "arm_compute/core/Types.h"
32#include "arm_compute/core/Utils.h"
33#include "arm_compute/core/Validate.h"
34#include "arm_compute/core/Window.h"
35
36#include <arm_neon.h>
37#include <cstddef>
38#include <cstdint>
39
40using namespace arm_compute;
41
namespace arm_compute
{
// Forward declaration of the coordinate type received by the window-loop lambdas in this file
class Coordinates;
} // namespace arm_compute
46
47INEGEMMLowpReductionKernel::INEGEMMLowpReductionKernel()
48 : _input(), _output(), _k(0), _is_reshaped(false)
49{
50}
51
Gian Marcoe75a02b2017-11-08 12:24:09 +000052void NEGEMMLowpMatrixAReductionKernel::configure(const ITensor *mtx_a, ITensor *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4)
Gian Marco Iodiceab182122017-10-09 15:05:40 +010053{
Gian Marcoe75a02b2017-11-08 12:24:09 +000054 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_a, 1, DataType::QASYMM8);
Gian Marco Iodiceab182122017-10-09 15:05:40 +010055 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
56
Gian Marcoe75a02b2017-11-08 12:24:09 +000057 _input = mtx_a;
Gian Marco Iodiceab182122017-10-09 15:05:40 +010058 _output = vector_sum_row;
59 _k = num_mtx_a_cols;
60 _is_reshaped = is_interleaved4x4;
61
62 const unsigned int num_elems_processed_per_iteration = _is_reshaped ? 4 : 1;
63
64 // Configure kernel window
65 Window win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration));
66
67 AccessWindowStatic input_access(_input->info(), 0, 0, ceil_to_multiple(_input->info()->dimension(0), 16), _input->info()->dimension(1));
68 AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration);
69
70 update_window_and_padding(win,
71 input_access,
72 output_access);
73
74 output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), _output->info()->tensor_shape()));
75
76 INEKernel::configure(win);
77}
78
79void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info)
80{
81 ARM_COMPUTE_UNUSED(info);
82 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
83 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
84
85 Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
86
87 Window win_input(collapsed_window);
88 win_input.set(Window::DimX, Window::Dimension(0, 0, 0));
89 win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
90 win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));
91
92 Iterator in(_input, win_input);
93 Iterator out(_output, collapsed_window);
94
95 if(_is_reshaped)
96 {
97 execute_window_loop(collapsed_window, [&](const Coordinates & id)
98 {
99 // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
Gian Marcoe75a02b2017-11-08 12:24:09 +0000100 uint32x4_t sum_row = vdupq_n_u32(0);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100101
Gian Marcoe75a02b2017-11-08 12:24:09 +0000102 const uint8_t *matrix_a = (in.ptr() + (id.x() / 4) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100103
104#if __arm__
105 asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
106#endif /* __arm__ */
107
108 int i = 0;
109 // This for loop performs 4 accumulations
110 for(; i <= (_k - 4); i += 4)
111 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000112 const uint8x16_t a0_u8 = vld1q_u8(matrix_a + i * 4);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100113
114 // Convert U8 to U16
Gian Marcoe75a02b2017-11-08 12:24:09 +0000115 uint16x4x4_t a0_u16 =
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100116 {
117 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000118 vget_low_u16(vmovl_u8(vget_low_u8(a0_u8))),
119 vget_high_u16(vmovl_u8(vget_low_u8(a0_u8))),
120 vget_low_u16(vmovl_u8(vget_high_u8(a0_u8))),
121 vget_high_u16(vmovl_u8(vget_high_u8(a0_u8)))
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100122 }
123 };
124
125 // Accumulate to U16
Gian Marcoe75a02b2017-11-08 12:24:09 +0000126 a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[1]);
127 a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[2]);
128 a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[3]);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100129
130 // Accumulate to U32
Gian Marcoe75a02b2017-11-08 12:24:09 +0000131 sum_row = vaddw_u16(sum_row, a0_u16.val[0]);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100132 }
133
134 // This for loop performs the leftover accumulations
135 for(; i < _k; ++i)
136 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000137 const uint8x8_t a0_u8 = vld1_u8(matrix_a + i * 4);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100138
139 // Convert U8 to U16
Gian Marcoe75a02b2017-11-08 12:24:09 +0000140 const uint16x4_t a0_u16 = vget_low_u16(vmovl_u8(a0_u8));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100141
142 // Accumulate to U32
Gian Marcoe75a02b2017-11-08 12:24:09 +0000143 sum_row = vaddw_u16(sum_row, a0_u16);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100144 }
145
146 auto vector_sum_row = reinterpret_cast<int32_t *>(out.ptr());
147
Gian Marcoe75a02b2017-11-08 12:24:09 +0000148 vst1q_s32(vector_sum_row, vreinterpretq_s32_u32(sum_row));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100149 },
150 in, out);
151 }
152 else // it is not reshaped
153 {
154 execute_window_loop(collapsed_window, [&](const Coordinates & id)
155 {
156 // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
Gian Marcoe75a02b2017-11-08 12:24:09 +0000157 uint32x4_t sum_row_u32 = vdupq_n_u32(0);
158 uint32_t sum_row = 0;
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100159
Gian Marcoe75a02b2017-11-08 12:24:09 +0000160 const uint8_t *matrix_a = (in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + +id.y() * _input->info()->strides_in_bytes()[2]);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100161
162#if __arm__
163 asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
164#endif /* __arm__ */
165
166 int i = 0;
167 // This for loop performs 16 accumulations
168 for(; i <= (_k - 16); i += 16)
169 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000170 const uint8x16_t a0_u8 = vld1q_u8(matrix_a + i);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100171
172 // Partial accumulations in U16
Gian Marcoe75a02b2017-11-08 12:24:09 +0000173 const uint16x8_t tmp_sum0 = vaddl_u8(vget_low_u8(a0_u8), vget_high_u8(a0_u8));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100174
175 // Accumulate to U32
Gian Marcoe75a02b2017-11-08 12:24:09 +0000176 sum_row_u32 = vaddq_u32(sum_row_u32, vpaddlq_u16(tmp_sum0));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100177 }
178
179 // This for loop performs the leftover accumulations
180 for(; i < _k; ++i)
181 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000182 sum_row += static_cast<uint32_t>(matrix_a[i]);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100183 }
184
185#if defined(__aarch64__)
186 // Reduction operation available on 64 bit architectures only
Gian Marcoe75a02b2017-11-08 12:24:09 +0000187 sum_row += vaddvq_u32(sum_row_u32);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100188#else // __aarch64__
Gian Marcoe75a02b2017-11-08 12:24:09 +0000189 uint32x2_t tmp = vpadd_u32(vget_high_u32(sum_row_u32), vget_low_u32(sum_row_u32));
190 tmp = vpadd_u32(tmp, tmp);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100191
Gian Marcoe75a02b2017-11-08 12:24:09 +0000192 sum_row += vget_lane_u32(tmp, 0);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100193#endif // __aarch64__
194
195 *(reinterpret_cast<int *>(out.ptr())) = static_cast<int>(sum_row);
196 },
197 in, out);
198 }
199}
200
Gian Marcoe75a02b2017-11-08 12:24:09 +0000201void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW)
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100202{
Gian Marcoe75a02b2017-11-08 12:24:09 +0000203 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_b, 1, DataType::QASYMM8);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100204 ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
205
Gian Marcoe75a02b2017-11-08 12:24:09 +0000206 _input = mtx_b;
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100207 _output = vector_sum_col;
208 _k = num_mtx_b_rows;
209 _is_reshaped = is_transposed1xW;
210
211 constexpr unsigned int num_elems_processed_per_iteration = 16;
212
213 // Configure kernel window
214 Window win = calculate_max_window(*vector_sum_col->info(), Steps(num_elems_processed_per_iteration));
215
216 AccessWindowStatic input_access(_input->info(), 0, 0, ceil_to_multiple(_input->info()->dimension(0), 16), _input->info()->dimension(1));
217 AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration);
218
219 update_window_and_padding(win,
220 input_access,
221 output_access);
222
223 output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), _output->info()->tensor_shape()));
224
225 INEKernel::configure(win);
226}
227
228void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info)
229{
230 ARM_COMPUTE_UNUSED(info);
231 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
232 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
233
234 Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
235
236 if(_is_reshaped)
237 {
238 Window win_input(collapsed_window);
239 win_input.set(Window::DimX, Window::Dimension(0, 0, 0));
240 win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
241 win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));
242
243 Iterator in(_input, win_input);
244 Iterator out(_output, collapsed_window);
245
246 execute_window_loop(collapsed_window, [&](const Coordinates & id)
247 {
248 // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
Gian Marcoe75a02b2017-11-08 12:24:09 +0000249 uint32x4x4_t sum_col =
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100250 {
251 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000252 vdupq_n_u32(0),
253 vdupq_n_u32(0),
254 vdupq_n_u32(0),
255 vdupq_n_u32(0)
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100256 }
257 };
258
Gian Marcoe75a02b2017-11-08 12:24:09 +0000259 const uint8_t *matrix_b = in.ptr() + (id.x() / 16) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2];
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100260
261#if __arm__
262 asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
263#endif /* __arm__ */
264
265 int i = 0;
266 for(; i < _k; ++i)
267 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000268 const uint8x16_t b0_u8 = vld1q_u8(matrix_b + i * 16);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100269
Pablo Tello6ff12a02017-11-02 16:09:35 +0000270 // Convert S8 to U16
Gian Marcoe75a02b2017-11-08 12:24:09 +0000271 const uint16x8x2_t b0_u16 =
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100272 {
273 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000274 vmovl_u8(vget_low_u8(b0_u8)),
275 vmovl_u8(vget_high_u8(b0_u8))
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100276 }
277 };
278
279 // Accumulate to U32
280 sum_col =
281 {
282 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000283 vaddw_u16(sum_col.val[0], vget_low_u16(b0_u16.val[0])),
284 vaddw_u16(sum_col.val[1], vget_high_u16(b0_u16.val[0])),
285 vaddw_u16(sum_col.val[2], vget_low_u16(b0_u16.val[1])),
286 vaddw_u16(sum_col.val[3], vget_high_u16(b0_u16.val[1]))
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100287 }
288 };
289 }
290
291 auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
292
Gian Marcoe75a02b2017-11-08 12:24:09 +0000293 vst1q_s32(vector_sum_col + 0, vreinterpretq_s32_u32(sum_col.val[0]));
294 vst1q_s32(vector_sum_col + 4, vreinterpretq_s32_u32(sum_col.val[1]));
295 vst1q_s32(vector_sum_col + 8, vreinterpretq_s32_u32(sum_col.val[2]));
296 vst1q_s32(vector_sum_col + 12, vreinterpretq_s32_u32(sum_col.val[3]));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100297 },
298 in, out);
299 }
300 else // it is not reshaped
301 {
302 const auto width_matrix_b = static_cast<int>(_input->info()->dimension(0));
303 const auto in_b_stride = static_cast<int>(_input->info()->strides_in_bytes()[1]);
304
305 // The implementation computes 16 elements per iteration
306 const int window_start_x = 16 * info.thread_id;
307 const int window_step_x = 16 * info.num_threads;
308 // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
309 const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
310
311 Window win_out(collapsed_window);
312 win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
313
314 Window win_in(win_out);
315 win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
316 win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
317
318 Iterator inb(_input, win_in);
319 Iterator out(_output, win_out);
320
321 execute_window_loop(win_out, [&](const Coordinates & id)
322 {
323 if(id.x() > width_matrix_b)
324 {
325 return;
326 }
327
328 // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
Gian Marcoe75a02b2017-11-08 12:24:09 +0000329 uint32x4x4_t sum_col =
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100330 {
331 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000332 vdupq_n_u32(0),
333 vdupq_n_u32(0),
334 vdupq_n_u32(0),
335 vdupq_n_u32(0)
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100336 }
337 };
338
Gian Marcoe75a02b2017-11-08 12:24:09 +0000339 const uint8_t *matrix_b = inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2];
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100340
341#if __arm__
342 asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
343 asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride));
344#endif /* __arm__ */
345
346 int i = 0;
347 // This for loop performs 4 accumulations
348 for(; i <= (_k - 4); i += 4)
349 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000350 const uint8x16_t b0_u8 = vld1q_u8(matrix_b + 0 * in_b_stride);
351 const uint8x16_t b1_u8 = vld1q_u8(matrix_b + 1 * in_b_stride);
352 const uint8x16_t b2_u8 = vld1q_u8(matrix_b + 2 * in_b_stride);
353 const uint8x16_t b3_u8 = vld1q_u8(matrix_b + 3 * in_b_stride);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100354
355#if __arm__
356 asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride));
357 asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride));
358 asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride));
359 asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride));
360#endif /* __arm__ */
361
362 // Partial accumulation in u16
Gian Marcoe75a02b2017-11-08 12:24:09 +0000363 uint16x8x2_t tmp_sum =
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100364 {
365 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000366 vdupq_n_u16(0),
367 vdupq_n_u16(0)
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100368 }
369 };
370
Gian Marcoe75a02b2017-11-08 12:24:09 +0000371 tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b0_u8));
372 tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b1_u8));
373 tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b2_u8));
374 tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b3_u8));
375 tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b0_u8));
376 tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b1_u8));
377 tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b2_u8));
378 tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b3_u8));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100379
380 // Accumulate to U32
381 sum_col =
382 {
383 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000384 vaddw_u16(sum_col.val[0], vget_low_u16(tmp_sum.val[0])),
385 vaddw_u16(sum_col.val[1], vget_high_u16(tmp_sum.val[0])),
386 vaddw_u16(sum_col.val[2], vget_low_u16(tmp_sum.val[1])),
387 vaddw_u16(sum_col.val[3], vget_high_u16(tmp_sum.val[1]))
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100388 }
389 };
390
391 matrix_b += 4 * in_b_stride;
392 }
393
394 // This for loop perfoms the leftover accumulations
395 for(; i < _k; ++i)
396 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000397 const uint8x16_t b0_u8 = vld1q_u8(matrix_b + 0 * in_b_stride);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100398
Pablo Tello6ff12a02017-11-02 16:09:35 +0000399 // Convert S8 to S16
Gian Marcoe75a02b2017-11-08 12:24:09 +0000400 const uint16x8x2_t b0_u16 =
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100401 {
402 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000403 vmovl_u8(vget_low_u8(b0_u8)),
404 vmovl_u8(vget_high_u8(b0_u8))
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100405 }
406 };
407
408 // Accumulate to U32
409 sum_col =
410 {
411 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000412 vaddw_u16(sum_col.val[0], vget_low_u16(b0_u16.val[0])),
413 vaddw_u16(sum_col.val[1], vget_high_u16(b0_u16.val[0])),
414 vaddw_u16(sum_col.val[2], vget_low_u16(b0_u16.val[1])),
415 vaddw_u16(sum_col.val[3], vget_high_u16(b0_u16.val[1]))
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100416 }
417 };
418
419 matrix_b += in_b_stride;
420 }
421
422 auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
423
Gian Marcoe75a02b2017-11-08 12:24:09 +0000424 vst1q_s32(vector_sum_col + 0, vreinterpretq_s32_u32(sum_col.val[0]));
425 vst1q_s32(vector_sum_col + 4, vreinterpretq_s32_u32(sum_col.val[1]));
426 vst1q_s32(vector_sum_col + 8, vreinterpretq_s32_u32(sum_col.val[2]));
427 vst1q_s32(vector_sum_col + 12, vreinterpretq_s32_u32(sum_col.val[3]));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100428 },
429 inb, out);
430 }
Pablo Tello6ff12a02017-11-02 16:09:35 +0000431}