blob: c1ee770db515eb716d1fa2de49cd3187f784cc76 [file] [log] [blame]
Gian Marco Iodiceab182122017-10-09 15:05:40 +01001/*
Diego Lopez Recasbcbc9702017-12-18 11:28:27 +00002 * Copyright (c) 2017, 2018 ARM Limited.
Gian Marco Iodiceab182122017-10-09 15:05:40 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
25
26#include "arm_compute/core/AccessWindowStatic.h"
27#include "arm_compute/core/Error.h"
28#include "arm_compute/core/Helpers.h"
29#include "arm_compute/core/ITensor.h"
30#include "arm_compute/core/TensorInfo.h"
31#include "arm_compute/core/Types.h"
32#include "arm_compute/core/Utils.h"
33#include "arm_compute/core/Validate.h"
34#include "arm_compute/core/Window.h"
35
36#include <arm_neon.h>
37#include <cstddef>
38#include <cstdint>
39
40using namespace arm_compute;
41
42namespace arm_compute
43{
44class Coordinates;
45} // namespace arm_compute
46
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000047namespace
48{
Georgios Pinitas631c41a2017-12-06 11:53:03 +000049Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output)
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000050{
51 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8);
52 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
53
Georgios Pinitas631c41a2017-12-06 11:53:03 +000054 return Status{};
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000055}
Georgios Pinitas631c41a2017-12-06 11:53:03 +000056std::pair<Status, Window> validate_and_configure_window_matrix_a_reduction(ITensorInfo *input, ITensorInfo *output, bool is_reshaped)
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000057{
58 const unsigned int num_elems_processed_per_iteration = is_reshaped ? 4 : 1;
59
60 Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
61
62 AccessWindowStatic input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1));
63 AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
64
65 bool window_changed = update_window_and_padding(win, input_access, output_access);
66
Diego Lopez Recasbcbc9702017-12-18 11:28:27 +000067 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000068
Georgios Pinitas631c41a2017-12-06 11:53:03 +000069 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000070 return std::make_pair(err, win);
71}
72
Georgios Pinitas631c41a2017-12-06 11:53:03 +000073Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output)
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000074{
75 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8);
76 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
77
Georgios Pinitas631c41a2017-12-06 11:53:03 +000078 return Status{};
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000079}
80
Georgios Pinitas631c41a2017-12-06 11:53:03 +000081std::pair<Status, Window> validate_and_configure_window_matrix_b_reduction(ITensorInfo *input, ITensorInfo *output)
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000082{
83 constexpr unsigned int num_elems_processed_per_iteration = 16;
84
85 // Configure kernel window
86 Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
87
88 AccessWindowStatic input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1));
89 AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
90
91 bool window_changed = update_window_and_padding(win, input_access, output_access);
92
Diego Lopez Recasbcbc9702017-12-18 11:28:27 +000093 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000094
Georgios Pinitas631c41a2017-12-06 11:53:03 +000095 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000096 return std::make_pair(err, win);
97}
98} // namespace
99
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100100INEGEMMLowpReductionKernel::INEGEMMLowpReductionKernel()
101 : _input(), _output(), _k(0), _is_reshaped(false)
102{
103}
104
Gian Marcoe75a02b2017-11-08 12:24:09 +0000105void NEGEMMLowpMatrixAReductionKernel::configure(const ITensor *mtx_a, ITensor *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4)
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100106{
Georgios Pinitasa3b1b462017-11-16 19:24:39 +0000107 // Perform validate step
108 ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row);
109 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info()));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100110
Gian Marcoe75a02b2017-11-08 12:24:09 +0000111 _input = mtx_a;
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100112 _output = vector_sum_row;
113 _k = num_mtx_a_cols;
114 _is_reshaped = is_interleaved4x4;
115
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100116 // Configure kernel window
Georgios Pinitasa3b1b462017-11-16 19:24:39 +0000117 auto win_config = validate_and_configure_window_matrix_a_reduction(_input->info(), _output->info(), _is_reshaped);
118 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
119 INEKernel::configure(win_config.second);
120}
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100121
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000122Status NEGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4)
Georgios Pinitasa3b1b462017-11-16 19:24:39 +0000123{
124 ARM_COMPUTE_UNUSED(num_mtx_a_cols);
125 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row));
126 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_a_reduction(mtx_a->clone().get(), vector_sum_row->clone().get(), is_interleaved4x4).first);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100127
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000128 return Status{};
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100129}
130
131void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info)
132{
133 ARM_COMPUTE_UNUSED(info);
134 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
135 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
136
137 Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
138
139 Window win_input(collapsed_window);
140 win_input.set(Window::DimX, Window::Dimension(0, 0, 0));
141 win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
142 win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));
143
144 Iterator in(_input, win_input);
145 Iterator out(_output, collapsed_window);
146
147 if(_is_reshaped)
148 {
149 execute_window_loop(collapsed_window, [&](const Coordinates & id)
150 {
151 // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
Gian Marcoe75a02b2017-11-08 12:24:09 +0000152 uint32x4_t sum_row = vdupq_n_u32(0);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100153
Gian Marcoe75a02b2017-11-08 12:24:09 +0000154 const uint8_t *matrix_a = (in.ptr() + (id.x() / 4) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100155
156#if __arm__
157 asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
158#endif /* __arm__ */
159
160 int i = 0;
161 // This for loop performs 4 accumulations
162 for(; i <= (_k - 4); i += 4)
163 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000164 const uint8x16_t a0_u8 = vld1q_u8(matrix_a + i * 4);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100165
166 // Convert U8 to U16
Gian Marcoe75a02b2017-11-08 12:24:09 +0000167 uint16x4x4_t a0_u16 =
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100168 {
169 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000170 vget_low_u16(vmovl_u8(vget_low_u8(a0_u8))),
171 vget_high_u16(vmovl_u8(vget_low_u8(a0_u8))),
172 vget_low_u16(vmovl_u8(vget_high_u8(a0_u8))),
173 vget_high_u16(vmovl_u8(vget_high_u8(a0_u8)))
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100174 }
175 };
176
177 // Accumulate to U16
Gian Marcoe75a02b2017-11-08 12:24:09 +0000178 a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[1]);
179 a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[2]);
180 a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[3]);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100181
182 // Accumulate to U32
Gian Marcoe75a02b2017-11-08 12:24:09 +0000183 sum_row = vaddw_u16(sum_row, a0_u16.val[0]);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100184 }
185
186 // This for loop performs the leftover accumulations
187 for(; i < _k; ++i)
188 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000189 const uint8x8_t a0_u8 = vld1_u8(matrix_a + i * 4);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100190
191 // Convert U8 to U16
Gian Marcoe75a02b2017-11-08 12:24:09 +0000192 const uint16x4_t a0_u16 = vget_low_u16(vmovl_u8(a0_u8));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100193
194 // Accumulate to U32
Gian Marcoe75a02b2017-11-08 12:24:09 +0000195 sum_row = vaddw_u16(sum_row, a0_u16);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100196 }
197
198 auto vector_sum_row = reinterpret_cast<int32_t *>(out.ptr());
199
Gian Marcoe75a02b2017-11-08 12:24:09 +0000200 vst1q_s32(vector_sum_row, vreinterpretq_s32_u32(sum_row));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100201 },
202 in, out);
203 }
204 else // it is not reshaped
205 {
206 execute_window_loop(collapsed_window, [&](const Coordinates & id)
207 {
208 // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
Gian Marcoe75a02b2017-11-08 12:24:09 +0000209 uint32x4_t sum_row_u32 = vdupq_n_u32(0);
210 uint32_t sum_row = 0;
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100211
Gian Marco05288a22017-11-21 10:57:50 +0000212 const uint8_t *matrix_a = (in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100213
214#if __arm__
215 asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
216#endif /* __arm__ */
217
218 int i = 0;
219 // This for loop performs 16 accumulations
220 for(; i <= (_k - 16); i += 16)
221 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000222 const uint8x16_t a0_u8 = vld1q_u8(matrix_a + i);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100223
224 // Partial accumulations in U16
Gian Marcoe75a02b2017-11-08 12:24:09 +0000225 const uint16x8_t tmp_sum0 = vaddl_u8(vget_low_u8(a0_u8), vget_high_u8(a0_u8));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100226
227 // Accumulate to U32
Gian Marcoe75a02b2017-11-08 12:24:09 +0000228 sum_row_u32 = vaddq_u32(sum_row_u32, vpaddlq_u16(tmp_sum0));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100229 }
230
231 // This for loop performs the leftover accumulations
232 for(; i < _k; ++i)
233 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000234 sum_row += static_cast<uint32_t>(matrix_a[i]);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100235 }
236
237#if defined(__aarch64__)
238 // Reduction operation available on 64 bit architectures only
Gian Marcoe75a02b2017-11-08 12:24:09 +0000239 sum_row += vaddvq_u32(sum_row_u32);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100240#else // __aarch64__
Gian Marcoe75a02b2017-11-08 12:24:09 +0000241 uint32x2_t tmp = vpadd_u32(vget_high_u32(sum_row_u32), vget_low_u32(sum_row_u32));
242 tmp = vpadd_u32(tmp, tmp);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100243
Gian Marcoe75a02b2017-11-08 12:24:09 +0000244 sum_row += vget_lane_u32(tmp, 0);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100245#endif // __aarch64__
246
247 *(reinterpret_cast<int *>(out.ptr())) = static_cast<int>(sum_row);
248 },
249 in, out);
250 }
251}
252
Gian Marcoe75a02b2017-11-08 12:24:09 +0000253void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW)
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100254{
Georgios Pinitasa3b1b462017-11-16 19:24:39 +0000255 ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col);
256 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info()));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100257
Gian Marcoe75a02b2017-11-08 12:24:09 +0000258 _input = mtx_b;
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100259 _output = vector_sum_col;
260 _k = num_mtx_b_rows;
261 _is_reshaped = is_transposed1xW;
262
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100263 // Configure kernel window
Georgios Pinitasa3b1b462017-11-16 19:24:39 +0000264 auto win_config = validate_and_configure_window_matrix_b_reduction(_input->info(), _output->info());
265 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
266 INEKernel::configure(win_config.second);
267}
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100268
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000269Status NEGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW)
Georgios Pinitasa3b1b462017-11-16 19:24:39 +0000270{
271 ARM_COMPUTE_UNUSED(num_mtx_b_rows);
272 ARM_COMPUTE_UNUSED(is_transposed1xW);
273 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col));
274 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_b_reduction(mtx_b->clone().get(), vector_sum_col->clone().get()).first);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100275
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000276 return Status{};
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100277}
278
279void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info)
280{
281 ARM_COMPUTE_UNUSED(info);
282 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
283 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
284
285 Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
286
287 if(_is_reshaped)
288 {
289 Window win_input(collapsed_window);
290 win_input.set(Window::DimX, Window::Dimension(0, 0, 0));
291 win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
292 win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));
293
294 Iterator in(_input, win_input);
295 Iterator out(_output, collapsed_window);
296
297 execute_window_loop(collapsed_window, [&](const Coordinates & id)
298 {
299 // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
Gian Marcoe75a02b2017-11-08 12:24:09 +0000300 uint32x4x4_t sum_col =
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100301 {
302 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000303 vdupq_n_u32(0),
304 vdupq_n_u32(0),
305 vdupq_n_u32(0),
306 vdupq_n_u32(0)
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100307 }
308 };
309
Gian Marcoe75a02b2017-11-08 12:24:09 +0000310 const uint8_t *matrix_b = in.ptr() + (id.x() / 16) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2];
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100311
312#if __arm__
313 asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
314#endif /* __arm__ */
315
316 int i = 0;
317 for(; i < _k; ++i)
318 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000319 const uint8x16_t b0_u8 = vld1q_u8(matrix_b + i * 16);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100320
Pablo Tello6ff12a02017-11-02 16:09:35 +0000321 // Convert S8 to U16
Gian Marcoe75a02b2017-11-08 12:24:09 +0000322 const uint16x8x2_t b0_u16 =
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100323 {
324 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000325 vmovl_u8(vget_low_u8(b0_u8)),
326 vmovl_u8(vget_high_u8(b0_u8))
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100327 }
328 };
329
330 // Accumulate to U32
331 sum_col =
332 {
333 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000334 vaddw_u16(sum_col.val[0], vget_low_u16(b0_u16.val[0])),
335 vaddw_u16(sum_col.val[1], vget_high_u16(b0_u16.val[0])),
336 vaddw_u16(sum_col.val[2], vget_low_u16(b0_u16.val[1])),
337 vaddw_u16(sum_col.val[3], vget_high_u16(b0_u16.val[1]))
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100338 }
339 };
340 }
341
342 auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
343
Gian Marcoe75a02b2017-11-08 12:24:09 +0000344 vst1q_s32(vector_sum_col + 0, vreinterpretq_s32_u32(sum_col.val[0]));
345 vst1q_s32(vector_sum_col + 4, vreinterpretq_s32_u32(sum_col.val[1]));
346 vst1q_s32(vector_sum_col + 8, vreinterpretq_s32_u32(sum_col.val[2]));
347 vst1q_s32(vector_sum_col + 12, vreinterpretq_s32_u32(sum_col.val[3]));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100348 },
349 in, out);
350 }
351 else // it is not reshaped
352 {
353 const auto width_matrix_b = static_cast<int>(_input->info()->dimension(0));
354 const auto in_b_stride = static_cast<int>(_input->info()->strides_in_bytes()[1]);
355
356 // The implementation computes 16 elements per iteration
357 const int window_start_x = 16 * info.thread_id;
358 const int window_step_x = 16 * info.num_threads;
359 // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
360 const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
361
362 Window win_out(collapsed_window);
363 win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
364
365 Window win_in(win_out);
366 win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
367 win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
368
369 Iterator inb(_input, win_in);
370 Iterator out(_output, win_out);
371
372 execute_window_loop(win_out, [&](const Coordinates & id)
373 {
374 if(id.x() > width_matrix_b)
375 {
376 return;
377 }
378
379 // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
Gian Marcoe75a02b2017-11-08 12:24:09 +0000380 uint32x4x4_t sum_col =
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100381 {
382 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000383 vdupq_n_u32(0),
384 vdupq_n_u32(0),
385 vdupq_n_u32(0),
386 vdupq_n_u32(0)
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100387 }
388 };
389
Gian Marcoe75a02b2017-11-08 12:24:09 +0000390 const uint8_t *matrix_b = inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2];
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100391
392#if __arm__
393 asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
394 asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride));
395#endif /* __arm__ */
396
397 int i = 0;
398 // This for loop performs 4 accumulations
399 for(; i <= (_k - 4); i += 4)
400 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000401 const uint8x16_t b0_u8 = vld1q_u8(matrix_b + 0 * in_b_stride);
402 const uint8x16_t b1_u8 = vld1q_u8(matrix_b + 1 * in_b_stride);
403 const uint8x16_t b2_u8 = vld1q_u8(matrix_b + 2 * in_b_stride);
404 const uint8x16_t b3_u8 = vld1q_u8(matrix_b + 3 * in_b_stride);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100405
406#if __arm__
407 asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride));
408 asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride));
409 asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride));
410 asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride));
411#endif /* __arm__ */
412
413 // Partial accumulation in u16
Gian Marcoe75a02b2017-11-08 12:24:09 +0000414 uint16x8x2_t tmp_sum =
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100415 {
416 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000417 vdupq_n_u16(0),
418 vdupq_n_u16(0)
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100419 }
420 };
421
Gian Marcoe75a02b2017-11-08 12:24:09 +0000422 tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b0_u8));
423 tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b1_u8));
424 tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b2_u8));
425 tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b3_u8));
426 tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b0_u8));
427 tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b1_u8));
428 tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b2_u8));
429 tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b3_u8));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100430
431 // Accumulate to U32
432 sum_col =
433 {
434 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000435 vaddw_u16(sum_col.val[0], vget_low_u16(tmp_sum.val[0])),
436 vaddw_u16(sum_col.val[1], vget_high_u16(tmp_sum.val[0])),
437 vaddw_u16(sum_col.val[2], vget_low_u16(tmp_sum.val[1])),
438 vaddw_u16(sum_col.val[3], vget_high_u16(tmp_sum.val[1]))
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100439 }
440 };
441
442 matrix_b += 4 * in_b_stride;
443 }
444
445 // This for loop perfoms the leftover accumulations
446 for(; i < _k; ++i)
447 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000448 const uint8x16_t b0_u8 = vld1q_u8(matrix_b + 0 * in_b_stride);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100449
Pablo Tello6ff12a02017-11-02 16:09:35 +0000450 // Convert S8 to S16
Gian Marcoe75a02b2017-11-08 12:24:09 +0000451 const uint16x8x2_t b0_u16 =
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100452 {
453 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000454 vmovl_u8(vget_low_u8(b0_u8)),
455 vmovl_u8(vget_high_u8(b0_u8))
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100456 }
457 };
458
459 // Accumulate to U32
460 sum_col =
461 {
462 {
Gian Marcoe75a02b2017-11-08 12:24:09 +0000463 vaddw_u16(sum_col.val[0], vget_low_u16(b0_u16.val[0])),
464 vaddw_u16(sum_col.val[1], vget_high_u16(b0_u16.val[0])),
465 vaddw_u16(sum_col.val[2], vget_low_u16(b0_u16.val[1])),
466 vaddw_u16(sum_col.val[3], vget_high_u16(b0_u16.val[1]))
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100467 }
468 };
469
470 matrix_b += in_b_stride;
471 }
472
473 auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
474
Gian Marcoe75a02b2017-11-08 12:24:09 +0000475 vst1q_s32(vector_sum_col + 0, vreinterpretq_s32_u32(sum_col.val[0]));
476 vst1q_s32(vector_sum_col + 4, vreinterpretq_s32_u32(sum_col.val[1]));
477 vst1q_s32(vector_sum_col + 8, vreinterpretq_s32_u32(sum_col.val[2]));
478 vst1q_s32(vector_sum_col + 12, vreinterpretq_s32_u32(sum_col.val[3]));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100479 },
480 inb, out);
481 }
Pablo Tello6ff12a02017-11-02 16:09:35 +0000482}