blob: 374005d897e982b78e5b5fcb58cc164cbbe4fabd [file] [log] [blame]
/*
 * Copyright (c) 2017-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
25
26#include "arm_compute/core/AccessWindowStatic.h"
27#include "arm_compute/core/Error.h"
28#include "arm_compute/core/Helpers.h"
29#include "arm_compute/core/ITensor.h"
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +010030#include "arm_compute/core/NEON/wrapper/wrapper.h"
Gian Marco Iodiceab182122017-10-09 15:05:40 +010031#include "arm_compute/core/TensorInfo.h"
32#include "arm_compute/core/Types.h"
33#include "arm_compute/core/Utils.h"
34#include "arm_compute/core/Validate.h"
35#include "arm_compute/core/Window.h"
36
Gian Marco Iodiceab182122017-10-09 15:05:40 +010037#include <cstddef>
38#include <cstdint>
39
40using namespace arm_compute;
41
// Forward declaration of the coordinate type that this file names in its
// window-loop lambdas and valid-region setup.
namespace arm_compute
{
class Coordinates;
} // namespace arm_compute
46
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000047namespace
48{
Georgios Pinitas631c41a2017-12-06 11:53:03 +000049Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output)
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000050{
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +010051 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000052 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
53
Georgios Pinitas631c41a2017-12-06 11:53:03 +000054 return Status{};
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000055}
Georgios Pinitas631c41a2017-12-06 11:53:03 +000056std::pair<Status, Window> validate_and_configure_window_matrix_a_reduction(ITensorInfo *input, ITensorInfo *output, bool is_reshaped)
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000057{
58 const unsigned int num_elems_processed_per_iteration = is_reshaped ? 4 : 1;
59
60 Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
61
62 AccessWindowStatic input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1));
63 AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
64
65 bool window_changed = update_window_and_padding(win, input_access, output_access);
66
Diego Lopez Recasbcbc9702017-12-18 11:28:27 +000067 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000068
Georgios Pinitas631c41a2017-12-06 11:53:03 +000069 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000070 return std::make_pair(err, win);
71}
72
Georgios Pinitas631c41a2017-12-06 11:53:03 +000073Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output)
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000074{
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +010075 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000076 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
77
Georgios Pinitas631c41a2017-12-06 11:53:03 +000078 return Status{};
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000079}
80
Georgios Pinitas631c41a2017-12-06 11:53:03 +000081std::pair<Status, Window> validate_and_configure_window_matrix_b_reduction(ITensorInfo *input, ITensorInfo *output)
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000082{
83 constexpr unsigned int num_elems_processed_per_iteration = 16;
84
85 // Configure kernel window
86 Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
87
88 AccessWindowStatic input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1));
89 AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
90
91 bool window_changed = update_window_and_padding(win, input_access, output_access);
92
Diego Lopez Recasbcbc9702017-12-18 11:28:27 +000093 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000094
Georgios Pinitas631c41a2017-12-06 11:53:03 +000095 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000096 return std::make_pair(err, win);
97}
98} // namespace
99
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100100INEGEMMLowpReductionKernel::INEGEMMLowpReductionKernel()
101 : _input(), _output(), _k(0), _is_reshaped(false)
102{
103}
104
Gian Marcoe75a02b2017-11-08 12:24:09 +0000105void NEGEMMLowpMatrixAReductionKernel::configure(const ITensor *mtx_a, ITensor *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4)
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100106{
Georgios Pinitasa3b1b462017-11-16 19:24:39 +0000107 // Perform validate step
108 ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row);
109 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info()));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100110
Gian Marcoe75a02b2017-11-08 12:24:09 +0000111 _input = mtx_a;
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100112 _output = vector_sum_row;
113 _k = num_mtx_a_cols;
114 _is_reshaped = is_interleaved4x4;
115
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100116 // Configure kernel window
Georgios Pinitasa3b1b462017-11-16 19:24:39 +0000117 auto win_config = validate_and_configure_window_matrix_a_reduction(_input->info(), _output->info(), _is_reshaped);
118 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
119 INEKernel::configure(win_config.second);
120}
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100121
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000122Status NEGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4)
Georgios Pinitasa3b1b462017-11-16 19:24:39 +0000123{
124 ARM_COMPUTE_UNUSED(num_mtx_a_cols);
125 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row));
126 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_a_reduction(mtx_a->clone().get(), vector_sum_row->clone().get(), is_interleaved4x4).first);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100127
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000128 return Status{};
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100129}
130
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100131template <typename T>
132void NEGEMMLowpMatrixAReductionKernel::run_internal(const arm_compute::Window &window)
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100133{
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100134 // Intermediate and final accumulator types
135 using TIAcc = wrapper::traits::promote_t<T>;
136 using TAcc = wrapper::traits::promote_t<TIAcc>;
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100137
138 Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
139
140 Window win_input(collapsed_window);
141 win_input.set(Window::DimX, Window::Dimension(0, 0, 0));
142 win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
143 win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));
144
145 Iterator in(_input, win_input);
146 Iterator out(_output, collapsed_window);
147
148 if(_is_reshaped)
149 {
150 execute_window_loop(collapsed_window, [&](const Coordinates & id)
151 {
152 // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100153 auto sum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100154
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100155 const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + (id.x() / 4) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100156
157#if __arm__
158 asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
159#endif /* __arm__ */
160
161 int i = 0;
162 // This for loop performs 4 accumulations
163 for(; i <= (_k - 4); i += 4)
164 {
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100165 const auto a0_d8 = wrapper::vloadq(matrix_a + i * 4);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100166
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100167 // Convert 8-bit to 16-bit
168 typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W64>::type a0_d16[4] =
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100169 {
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100170 wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a0_d8))),
171 wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a0_d8))),
172 wrapper::vgetlow(wrapper::vmovl((wrapper::vgethigh(a0_d8)))),
173 wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a0_d8)))
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100174 };
175
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100176 // Accumulate to 16-bit
177 a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[1]);
178 a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[2]);
179 a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[3]);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100180
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100181 // Accumulate to 32-bit
182 sum_row = wrapper::vaddw(sum_row, a0_d16[0]);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100183 }
184
185 // This for loop performs the leftover accumulations
186 for(; i < _k; ++i)
187 {
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100188 const auto a0_d8 = wrapper::vload(matrix_a + i * 4);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100189
190 // Convert U8 to U16
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100191 const auto a0_d16 = wrapper::vgetlow(wrapper::vmovl(a0_d8));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100192
193 // Accumulate to U32
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100194 sum_row = wrapper::vaddw(sum_row, a0_d16);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100195 }
196
197 auto vector_sum_row = reinterpret_cast<int32_t *>(out.ptr());
198
Michele Di Giorgio13ec5f02020-01-02 12:11:13 +0000199 wrapper::vstore(vector_sum_row, wrapper::vreinterpret(sum_row));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100200 },
201 in, out);
202 }
203 else // it is not reshaped
204 {
205 execute_window_loop(collapsed_window, [&](const Coordinates & id)
206 {
207 // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100208 auto vsum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
209 TAcc sum_row = 0;
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100210
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100211 const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100212
213#if __arm__
214 asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
215#endif /* __arm__ */
216
217 int i = 0;
218 // This for loop performs 16 accumulations
219 for(; i <= (_k - 16); i += 16)
220 {
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100221 const auto a0_d8 = wrapper::vloadq(matrix_a + i);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100222
223 // Partial accumulations in U16
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100224 const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100225
226 // Accumulate to U32
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100227 vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100228 }
229
230 // This for loop performs the leftover accumulations
231 for(; i < _k; ++i)
232 {
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100233 sum_row += static_cast<TAcc>(matrix_a[i]);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100234 }
235
236#if defined(__aarch64__)
237 // Reduction operation available on 64 bit architectures only
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100238 sum_row += wrapper::vaddv(vsum_row);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100239#else // __aarch64__
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100240 auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row));
241 tmp = wrapper::vpadd(tmp, tmp);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100242
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100243 sum_row += wrapper::vgetlane(tmp, 0);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100244#endif // __aarch64__
245
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100246 *(reinterpret_cast<int *>(out.ptr())) = static_cast<int32_t>(sum_row);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100247 },
248 in, out);
249 }
250}
251
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100252void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info)
253{
254 ARM_COMPUTE_UNUSED(info);
255 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
256 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
257
258 switch(_input->info()->data_type())
259 {
260 case DataType::QASYMM8:
261 run_internal<uint8_t>(window);
262 break;
263 case DataType::QASYMM8_SIGNED:
264 case DataType::QSYMM8_PER_CHANNEL:
265 run_internal<int8_t>(window);
266 break;
267 default:
268 ARM_COMPUTE_ERROR("Unsupported data type");
269 }
270}
271
Gian Marcoe75a02b2017-11-08 12:24:09 +0000272void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW)
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100273{
Georgios Pinitasa3b1b462017-11-16 19:24:39 +0000274 ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col);
275 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info()));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100276
Gian Marcoe75a02b2017-11-08 12:24:09 +0000277 _input = mtx_b;
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100278 _output = vector_sum_col;
279 _k = num_mtx_b_rows;
280 _is_reshaped = is_transposed1xW;
281
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100282 // Configure kernel window
Georgios Pinitasa3b1b462017-11-16 19:24:39 +0000283 auto win_config = validate_and_configure_window_matrix_b_reduction(_input->info(), _output->info());
284 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
285 INEKernel::configure(win_config.second);
286}
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100287
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000288Status NEGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW)
Georgios Pinitasa3b1b462017-11-16 19:24:39 +0000289{
290 ARM_COMPUTE_UNUSED(num_mtx_b_rows);
291 ARM_COMPUTE_UNUSED(is_transposed1xW);
292 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col));
293 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_b_reduction(mtx_b->clone().get(), vector_sum_col->clone().get()).first);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100294
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000295 return Status{};
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100296}
297
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100298template <typename T>
299void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const ThreadInfo &info)
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100300{
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100301 // Intermediate and final accumulator types
302 using TIAcc = wrapper::traits::promote_t<T>;
303 using TAcc = wrapper::traits::promote_t<TIAcc>;
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100304
305 Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
306
307 if(_is_reshaped)
308 {
309 Window win_input(collapsed_window);
310 win_input.set(Window::DimX, Window::Dimension(0, 0, 0));
311 win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
312 win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));
313
314 Iterator in(_input, win_input);
315 Iterator out(_output, collapsed_window);
316
317 execute_window_loop(collapsed_window, [&](const Coordinates & id)
318 {
319 // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100320 typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] =
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100321 {
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100322 wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
323 wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
324 wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
325 wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100326 };
327
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100328 const auto *matrix_b = reinterpret_cast<const T *>(in.ptr() + (id.x() / 16) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100329
330#if __arm__
331 asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
332#endif /* __arm__ */
333
334 int i = 0;
335 for(; i < _k; ++i)
336 {
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100337 const auto b0_b8 = wrapper::vloadq(matrix_b + i * 16);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100338
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100339 // Convert 8bit to 16bit
340 const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2] =
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100341 {
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100342 wrapper::vmovl(wrapper::vgetlow(b0_b8)),
343 wrapper::vmovl(wrapper::vgethigh(b0_b8))
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100344 };
345
346 // Accumulate to U32
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100347 sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0]));
348 sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0]));
349 sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1]));
350 sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100351 }
352
353 auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
354
Michele Di Giorgio13ec5f02020-01-02 12:11:13 +0000355 wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0]));
356 wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1]));
357 wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2]));
358 wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3]));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100359 },
360 in, out);
361 }
362 else // it is not reshaped
363 {
364 const auto width_matrix_b = static_cast<int>(_input->info()->dimension(0));
365 const auto in_b_stride = static_cast<int>(_input->info()->strides_in_bytes()[1]);
366
367 // The implementation computes 16 elements per iteration
368 const int window_start_x = 16 * info.thread_id;
369 const int window_step_x = 16 * info.num_threads;
370 // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
371 const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
372
373 Window win_out(collapsed_window);
374 win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
375
376 Window win_in(win_out);
377 win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
378 win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
379
380 Iterator inb(_input, win_in);
381 Iterator out(_output, win_out);
382
383 execute_window_loop(win_out, [&](const Coordinates & id)
384 {
385 if(id.x() > width_matrix_b)
386 {
387 return;
388 }
389
390 // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100391 typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] =
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100392 {
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100393 wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
394 wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
395 wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
396 wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100397 };
398
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100399 const auto *matrix_b = reinterpret_cast<const T *>(inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2]);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100400
401#if __arm__
402 asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
403 asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride));
404#endif /* __arm__ */
405
406 int i = 0;
407 // This for loop performs 4 accumulations
408 for(; i <= (_k - 4); i += 4)
409 {
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100410 const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
411 const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride);
412 const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride);
413 const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100414
415#if __arm__
416 asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride));
417 asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride));
418 asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride));
419 asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride));
420#endif /* __arm__ */
421
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100422 // Partial accumulation in 16bit
423 typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type tmp_sum[2] =
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100424 {
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100425 wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{}),
426 wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{})
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100427 };
428
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100429 tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8));
430 tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8));
431 tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8));
432 tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8));
433 tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8));
434 tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8));
435 tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8));
436 tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100437
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100438 // Accumulate to 32bit
439 sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0]));
440 sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0]));
441 sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1]));
442 sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1]));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100443
444 matrix_b += 4 * in_b_stride;
445 }
446
447 // This for loop perfoms the leftover accumulations
448 for(; i < _k; ++i)
449 {
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100450 const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100451
Pablo Tello6ff12a02017-11-02 16:09:35 +0000452 // Convert S8 to S16
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100453 const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2]
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100454 {
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100455 wrapper::vmovl(wrapper::vgetlow(b0_b8)),
456 wrapper::vmovl(wrapper::vgethigh(b0_b8))
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100457 };
458
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100459 // Accumulate to 32bit
460 sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0]));
461 sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0]));
462 sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1]));
463 sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100464
465 matrix_b += in_b_stride;
466 }
467
468 auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
469
Michele Di Giorgio13ec5f02020-01-02 12:11:13 +0000470 wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0]));
471 wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1]));
472 wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2]));
473 wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3]));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100474 },
475 inb, out);
476 }
Pablo Tello6ff12a02017-11-02 16:09:35 +0000477}
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100478
479void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info)
480{
481 ARM_COMPUTE_UNUSED(info);
482 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
483 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
484
485 switch(_input->info()->data_type())
486 {
487 case DataType::QASYMM8:
488 run_internal<uint8_t>(window, info);
489 break;
490 case DataType::QASYMM8_SIGNED:
491 case DataType::QSYMM8_PER_CHANNEL:
492 run_internal<int8_t>(window, info);
493 break;
494 default:
495 ARM_COMPUTE_ERROR("Unsupported data type");
496 }
497}