/*
 * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
25
26#include "arm_compute/core/AccessWindowStatic.h"
Gian Marco Iodiceab182122017-10-09 15:05:40 +010027#include "arm_compute/core/ITensor.h"
Michele Di Giorgioa602f032020-03-12 19:34:33 +000028#include "arm_compute/core/KernelDescriptors.h"
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +010029#include "arm_compute/core/NEON/wrapper/wrapper.h"
Gian Marco Iodiceab182122017-10-09 15:05:40 +010030#include "arm_compute/core/TensorInfo.h"
Gian Marco Iodiceab182122017-10-09 15:05:40 +010031
Gian Marco Iodiceab182122017-10-09 15:05:40 +010032namespace arm_compute
33{
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000034namespace
35{
Georgios Pinitas631c41a2017-12-06 11:53:03 +000036Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output)
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000037{
Michele Di Giorgioa602f032020-03-12 19:34:33 +000038 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
Michele Di Giorgiof9b595a2020-07-03 13:34:52 +010039 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000040
Michele Di Giorgioa602f032020-03-12 19:34:33 +000041 if(output->total_size() > 0)
42 {
43 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
44 ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(1), "Output vector must have length equal to the number of rows of the input matrix");
45 }
Georgios Pinitas631c41a2017-12-06 11:53:03 +000046 return Status{};
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000047}
Georgios Pinitas631c41a2017-12-06 11:53:03 +000048std::pair<Status, Window> validate_and_configure_window_matrix_a_reduction(ITensorInfo *input, ITensorInfo *output, bool is_reshaped)
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000049{
50 const unsigned int num_elems_processed_per_iteration = is_reshaped ? 4 : 1;
51
Michele Di Giorgioa602f032020-03-12 19:34:33 +000052 // Output auto initialization if not yet initialized
53 auto_init_if_empty(*output, TensorShape(input->dimension(1)), 1, DataType::S32);
54
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000055 Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
56
57 AccessWindowStatic input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1));
58 AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
59
60 bool window_changed = update_window_and_padding(win, input_access, output_access);
61
Diego Lopez Recasbcbc9702017-12-18 11:28:27 +000062 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000063
Georgios Pinitas631c41a2017-12-06 11:53:03 +000064 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000065 return std::make_pair(err, win);
66}
67
Georgios Pinitas631c41a2017-12-06 11:53:03 +000068Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output)
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000069{
Michele Di Giorgioa602f032020-03-12 19:34:33 +000070 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
Michele Di Giorgio47a89902020-03-09 19:32:33 +000071 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000072
Michele Di Giorgioa602f032020-03-12 19:34:33 +000073 if(output->total_size() > 0)
74 {
75 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
76 ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0), "Output vector must have length equal to the number of columns of the input matrix");
77 }
Georgios Pinitas631c41a2017-12-06 11:53:03 +000078 return Status{};
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000079}
80
Georgios Pinitas631c41a2017-12-06 11:53:03 +000081std::pair<Status, Window> validate_and_configure_window_matrix_b_reduction(ITensorInfo *input, ITensorInfo *output)
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000082{
83 constexpr unsigned int num_elems_processed_per_iteration = 16;
84
Michele Di Giorgioa602f032020-03-12 19:34:33 +000085 // Output auto initialization if not yet initialized
86 auto_init_if_empty(*output, TensorShape(input->dimension(0)), 1, DataType::S32);
87
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000088 // Configure kernel window
89 Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
90
91 AccessWindowStatic input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1));
92 AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
93
94 bool window_changed = update_window_and_padding(win, input_access, output_access);
95
Diego Lopez Recasbcbc9702017-12-18 11:28:27 +000096 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000097
Georgios Pinitas631c41a2017-12-06 11:53:03 +000098 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000099 return std::make_pair(err, win);
100}
101} // namespace
102
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100103INEGEMMLowpReductionKernel::INEGEMMLowpReductionKernel()
Michele Di Giorgioa602f032020-03-12 19:34:33 +0000104 : _input(), _output(), _k(0), _is_reshaped(false), _scalar(0), _mul_by_scalar(false)
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100105{
106}
107
Michele Di Giorgioa602f032020-03-12 19:34:33 +0000108void NEGEMMLowpMatrixAReductionKernel::configure(const ITensor *mtx_a, ITensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100109{
Georgios Pinitasa3b1b462017-11-16 19:24:39 +0000110 // Perform validate step
111 ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row);
112 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info()));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100113
Michele Di Giorgioa602f032020-03-12 19:34:33 +0000114 _input = mtx_a;
115 _output = vector_sum_row;
116 _k = info.k;
117 _is_reshaped = info.is_reshaped;
118 _scalar = info.scalar;
119 _mul_by_scalar = info.mul_by_scalar;
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100120
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100121 // Configure kernel window
Georgios Pinitasa3b1b462017-11-16 19:24:39 +0000122 auto win_config = validate_and_configure_window_matrix_a_reduction(_input->info(), _output->info(), _is_reshaped);
123 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
124 INEKernel::configure(win_config.second);
125}
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100126
Michele Di Giorgioa602f032020-03-12 19:34:33 +0000127Status NEGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
Georgios Pinitasa3b1b462017-11-16 19:24:39 +0000128{
Georgios Pinitasa3b1b462017-11-16 19:24:39 +0000129 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row));
Michele Di Giorgioa602f032020-03-12 19:34:33 +0000130 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_a_reduction(mtx_a->clone().get(), vector_sum_row->clone().get(), info.is_reshaped).first);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100131
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000132 return Status{};
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100133}
134
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100135template <typename T>
136void NEGEMMLowpMatrixAReductionKernel::run_internal(const arm_compute::Window &window)
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100137{
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100138 // Intermediate and final accumulator types
139 using TIAcc = wrapper::traits::promote_t<T>;
140 using TAcc = wrapper::traits::promote_t<TIAcc>;
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100141
142 Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
143
144 Window win_input(collapsed_window);
145 win_input.set(Window::DimX, Window::Dimension(0, 0, 0));
146 win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
147 win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));
148
149 Iterator in(_input, win_input);
150 Iterator out(_output, collapsed_window);
151
Michele Di Giorgioa602f032020-03-12 19:34:33 +0000152 const auto vec_scalar = wrapper::vdup_n(static_cast<TAcc>(_scalar), wrapper::traits::vector_128_tag{});
153
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100154 if(_is_reshaped)
155 {
156 execute_window_loop(collapsed_window, [&](const Coordinates & id)
157 {
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100158 auto sum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100159
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100160 const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + (id.x() / 4) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100161
162#if __arm__
163 asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
164#endif /* __arm__ */
165
166 int i = 0;
167 // This for loop performs 4 accumulations
168 for(; i <= (_k - 4); i += 4)
169 {
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100170 const auto a0_d8 = wrapper::vloadq(matrix_a + i * 4);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100171
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100172 // Convert 8-bit to 16-bit
173 typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W64>::type a0_d16[4] =
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100174 {
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100175 wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a0_d8))),
176 wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a0_d8))),
177 wrapper::vgetlow(wrapper::vmovl((wrapper::vgethigh(a0_d8)))),
178 wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a0_d8)))
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100179 };
180
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100181 // Accumulate to 16-bit
182 a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[1]);
183 a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[2]);
184 a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[3]);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100185
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100186 // Accumulate to 32-bit
187 sum_row = wrapper::vaddw(sum_row, a0_d16[0]);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100188 }
189
190 // This for loop performs the leftover accumulations
191 for(; i < _k; ++i)
192 {
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100193 const auto a0_d8 = wrapper::vload(matrix_a + i * 4);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100194
195 // Convert U8 to U16
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100196 const auto a0_d16 = wrapper::vgetlow(wrapper::vmovl(a0_d8));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100197
198 // Accumulate to U32
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100199 sum_row = wrapper::vaddw(sum_row, a0_d16);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100200 }
201
Michele Di Giorgioa602f032020-03-12 19:34:33 +0000202 // Multiply by scalar if necessary
203 if(_mul_by_scalar)
204 {
205 sum_row = wrapper::vmul(sum_row, vec_scalar);
206 }
207
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100208 auto vector_sum_row = reinterpret_cast<int32_t *>(out.ptr());
209
Michele Di Giorgio13ec5f02020-01-02 12:11:13 +0000210 wrapper::vstore(vector_sum_row, wrapper::vreinterpret(sum_row));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100211 },
212 in, out);
213 }
214 else // it is not reshaped
215 {
216 execute_window_loop(collapsed_window, [&](const Coordinates & id)
217 {
218 // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100219 auto vsum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
220 TAcc sum_row = 0;
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100221
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100222 const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100223
224#if __arm__
225 asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
226#endif /* __arm__ */
227
228 int i = 0;
229 // This for loop performs 16 accumulations
230 for(; i <= (_k - 16); i += 16)
231 {
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100232 const auto a0_d8 = wrapper::vloadq(matrix_a + i);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100233
234 // Partial accumulations in U16
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100235 const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100236
237 // Accumulate to U32
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100238 vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100239 }
240
241 // This for loop performs the leftover accumulations
242 for(; i < _k; ++i)
243 {
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100244 sum_row += static_cast<TAcc>(matrix_a[i]);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100245 }
246
247#if defined(__aarch64__)
248 // Reduction operation available on 64 bit architectures only
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100249 sum_row += wrapper::vaddv(vsum_row);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100250#else // __aarch64__
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100251 auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row));
252 tmp = wrapper::vpadd(tmp, tmp);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100253
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100254 sum_row += wrapper::vgetlane(tmp, 0);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100255#endif // __aarch64__
256
Michele Di Giorgioa602f032020-03-12 19:34:33 +0000257 // Multiply by scalar if necessary
258 if(_mul_by_scalar)
259 {
260 sum_row *= _scalar;
261 }
262
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100263 *(reinterpret_cast<int *>(out.ptr())) = static_cast<int32_t>(sum_row);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100264 },
265 in, out);
266 }
267}
268
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100269void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info)
270{
271 ARM_COMPUTE_UNUSED(info);
272 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
273 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
274
275 switch(_input->info()->data_type())
276 {
277 case DataType::QASYMM8:
278 run_internal<uint8_t>(window);
279 break;
280 case DataType::QASYMM8_SIGNED:
Michele Di Giorgio47a89902020-03-09 19:32:33 +0000281 case DataType::QSYMM8:
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100282 case DataType::QSYMM8_PER_CHANNEL:
283 run_internal<int8_t>(window);
284 break;
285 default:
286 ARM_COMPUTE_ERROR("Unsupported data type");
287 }
288}
289
Michele Di Giorgioa602f032020-03-12 19:34:33 +0000290void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100291{
Georgios Pinitasa3b1b462017-11-16 19:24:39 +0000292 ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col);
293 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info()));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100294
Michele Di Giorgioa602f032020-03-12 19:34:33 +0000295 _input = mtx_b;
296 _output = vector_sum_col;
297 _k = info.k;
298 _is_reshaped = info.is_reshaped;
299 _scalar = info.scalar;
300 _mul_by_scalar = info.mul_by_scalar;
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100301
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100302 // Configure kernel window
Georgios Pinitasa3b1b462017-11-16 19:24:39 +0000303 auto win_config = validate_and_configure_window_matrix_b_reduction(_input->info(), _output->info());
304 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
305 INEKernel::configure(win_config.second);
306}
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100307
Michele Di Giorgioa602f032020-03-12 19:34:33 +0000308Status NEGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
Georgios Pinitasa3b1b462017-11-16 19:24:39 +0000309{
Michele Di Giorgioa602f032020-03-12 19:34:33 +0000310 ARM_COMPUTE_UNUSED(info);
Georgios Pinitasa3b1b462017-11-16 19:24:39 +0000311 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col));
312 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_b_reduction(mtx_b->clone().get(), vector_sum_col->clone().get()).first);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100313
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000314 return Status{};
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100315}
316
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100317template <typename T>
318void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const ThreadInfo &info)
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100319{
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100320 // Intermediate and final accumulator types
321 using TIAcc = wrapper::traits::promote_t<T>;
322 using TAcc = wrapper::traits::promote_t<TIAcc>;
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100323
324 Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
325
Michele Di Giorgioa602f032020-03-12 19:34:33 +0000326 const auto vec_scalar = wrapper::vdup_n(static_cast<TAcc>(_scalar), wrapper::traits::vector_128_tag{});
327
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100328 if(_is_reshaped)
329 {
330 Window win_input(collapsed_window);
331 win_input.set(Window::DimX, Window::Dimension(0, 0, 0));
332 win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
333 win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));
334
335 Iterator in(_input, win_input);
336 Iterator out(_output, collapsed_window);
337
338 execute_window_loop(collapsed_window, [&](const Coordinates & id)
339 {
340 // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100341 typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] =
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100342 {
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100343 wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
344 wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
345 wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
346 wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100347 };
348
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100349 const auto *matrix_b = reinterpret_cast<const T *>(in.ptr() + (id.x() / 16) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100350
351#if __arm__
352 asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
353#endif /* __arm__ */
354
355 int i = 0;
356 for(; i < _k; ++i)
357 {
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100358 const auto b0_b8 = wrapper::vloadq(matrix_b + i * 16);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100359
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100360 // Convert 8bit to 16bit
361 const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2] =
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100362 {
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100363 wrapper::vmovl(wrapper::vgetlow(b0_b8)),
364 wrapper::vmovl(wrapper::vgethigh(b0_b8))
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100365 };
366
367 // Accumulate to U32
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100368 sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0]));
369 sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0]));
370 sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1]));
371 sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100372 }
373
Michele Di Giorgioa602f032020-03-12 19:34:33 +0000374 // Multiply by scalar if necessary
375 if(_mul_by_scalar)
376 {
377 sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar);
378 sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar);
379 sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar);
380 sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar);
381 }
382
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100383 auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
384
Michele Di Giorgio13ec5f02020-01-02 12:11:13 +0000385 wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0]));
386 wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1]));
387 wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2]));
388 wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3]));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100389 },
390 in, out);
391 }
392 else // it is not reshaped
393 {
394 const auto width_matrix_b = static_cast<int>(_input->info()->dimension(0));
395 const auto in_b_stride = static_cast<int>(_input->info()->strides_in_bytes()[1]);
396
397 // The implementation computes 16 elements per iteration
398 const int window_start_x = 16 * info.thread_id;
399 const int window_step_x = 16 * info.num_threads;
400 // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
401 const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
402
403 Window win_out(collapsed_window);
404 win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
405
406 Window win_in(win_out);
407 win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
408 win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
409
410 Iterator inb(_input, win_in);
411 Iterator out(_output, win_out);
412
413 execute_window_loop(win_out, [&](const Coordinates & id)
414 {
415 if(id.x() > width_matrix_b)
416 {
417 return;
418 }
419
420 // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100421 typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] =
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100422 {
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100423 wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
424 wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
425 wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
426 wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100427 };
428
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100429 const auto *matrix_b = reinterpret_cast<const T *>(inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2]);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100430
431#if __arm__
432 asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
433 asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride));
434#endif /* __arm__ */
435
436 int i = 0;
437 // This for loop performs 4 accumulations
438 for(; i <= (_k - 4); i += 4)
439 {
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100440 const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
441 const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride);
442 const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride);
443 const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100444
445#if __arm__
446 asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride));
447 asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride));
448 asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride));
449 asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride));
450#endif /* __arm__ */
451
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100452 // Partial accumulation in 16bit
453 typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type tmp_sum[2] =
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100454 {
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100455 wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{}),
456 wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{})
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100457 };
458
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100459 tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8));
460 tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8));
461 tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8));
462 tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8));
463 tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8));
464 tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8));
465 tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8));
466 tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100467
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100468 // Accumulate to 32bit
469 sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0]));
470 sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0]));
471 sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1]));
472 sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1]));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100473
474 matrix_b += 4 * in_b_stride;
475 }
476
477 // This for loop perfoms the leftover accumulations
478 for(; i < _k; ++i)
479 {
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100480 const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100481
Pablo Tello6ff12a02017-11-02 16:09:35 +0000482 // Convert S8 to S16
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100483 const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2]
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100484 {
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100485 wrapper::vmovl(wrapper::vgetlow(b0_b8)),
486 wrapper::vmovl(wrapper::vgethigh(b0_b8))
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100487 };
488
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100489 // Accumulate to 32bit
490 sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0]));
491 sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0]));
492 sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1]));
493 sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100494
495 matrix_b += in_b_stride;
496 }
497
Michele Di Giorgioa602f032020-03-12 19:34:33 +0000498 // Multiply by scalar if necessary
499 if(_mul_by_scalar)
500 {
501 sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar);
502 sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar);
503 sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar);
504 sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar);
505 }
506
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100507 auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
508
Michele Di Giorgio13ec5f02020-01-02 12:11:13 +0000509 wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0]));
510 wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1]));
511 wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2]));
512 wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3]));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100513 },
514 inb, out);
515 }
Pablo Tello6ff12a02017-11-02 16:09:35 +0000516}
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100517
518void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info)
519{
520 ARM_COMPUTE_UNUSED(info);
521 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
522 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
523
524 switch(_input->info()->data_type())
525 {
526 case DataType::QASYMM8:
527 run_internal<uint8_t>(window, info);
528 break;
529 case DataType::QASYMM8_SIGNED:
Michele Di Giorgio47a89902020-03-09 19:32:33 +0000530 case DataType::QSYMM8:
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100531 case DataType::QSYMM8_PER_CHANNEL:
532 run_internal<int8_t>(window, info);
533 break;
534 default:
535 ARM_COMPUTE_ERROR("Unsupported data type");
536 }
537}
Michele Di Giorgioa602f032020-03-12 19:34:33 +0000538} // namespace arm_compute