blob: dfbfbd6fab34c596827949a2c5b5c3611140e3d1 [file] [log] [blame]
/*
 * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
Michalis Spyrouebcebf12020-10-21 00:04:14 +010024#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
Gian Marco Iodiceab182122017-10-09 15:05:40 +010025
Gian Marco Iodiceab182122017-10-09 15:05:40 +010026#include "arm_compute/core/ITensor.h"
Michele Di Giorgioa602f032020-03-12 19:34:33 +000027#include "arm_compute/core/KernelDescriptors.h"
Gian Marco Iodiceab182122017-10-09 15:05:40 +010028#include "arm_compute/core/TensorInfo.h"
Georgios Pinitasddb93bb2020-10-02 16:38:59 +010029#include "src/core/NEON/wrapper/wrapper.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010030#include "src/core/helpers/AutoConfiguration.h"
31#include "src/core/helpers/WindowHelpers.h"
Gian Marco Iodiceab182122017-10-09 15:05:40 +010032
Gian Marco Iodiceab182122017-10-09 15:05:40 +010033namespace arm_compute
34{
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000035namespace
36{
Georgios Pinitas631c41a2017-12-06 11:53:03 +000037Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output)
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000038{
Michele Di Giorgioa602f032020-03-12 19:34:33 +000039 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
Michele Di Giorgiof9b595a2020-07-03 13:34:52 +010040 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000041
Michele Di Giorgioa602f032020-03-12 19:34:33 +000042 if(output->total_size() > 0)
43 {
44 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
45 ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(1), "Output vector must have length equal to the number of rows of the input matrix");
46 }
Georgios Pinitas631c41a2017-12-06 11:53:03 +000047 return Status{};
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000048}
Georgios Pinitas631c41a2017-12-06 11:53:03 +000049Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output)
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000050{
Michele Di Giorgioa602f032020-03-12 19:34:33 +000051 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
Michele Di Giorgio47a89902020-03-09 19:32:33 +000052 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000053
Michele Di Giorgioa602f032020-03-12 19:34:33 +000054 if(output->total_size() > 0)
55 {
56 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
57 ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0), "Output vector must have length equal to the number of columns of the input matrix");
58 }
Georgios Pinitas631c41a2017-12-06 11:53:03 +000059 return Status{};
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000060}
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000061} // namespace
62
Gian Marco Iodiceab182122017-10-09 15:05:40 +010063INEGEMMLowpReductionKernel::INEGEMMLowpReductionKernel()
morgolockc229e8c2020-09-25 12:03:21 +010064 : _input(), _output(), _k(0), _scalar(0), _mul_by_scalar(false)
Gian Marco Iodiceab182122017-10-09 15:05:40 +010065{
66}
67
Michele Di Giorgioa602f032020-03-12 19:34:33 +000068void NEGEMMLowpMatrixAReductionKernel::configure(const ITensor *mtx_a, ITensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
Gian Marco Iodiceab182122017-10-09 15:05:40 +010069{
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000070 // Perform validate step
71 ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row);
morgolockc229e8c2020-09-25 12:03:21 +010072 ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported");
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000073 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info()));
Michele Di Giorgioa602f032020-03-12 19:34:33 +000074 _input = mtx_a;
75 _output = vector_sum_row;
76 _k = info.k;
Michele Di Giorgioa602f032020-03-12 19:34:33 +000077 _scalar = info.scalar;
78 _mul_by_scalar = info.mul_by_scalar;
Gian Marco Iodiceab182122017-10-09 15:05:40 +010079
morgolockc229e8c2020-09-25 12:03:21 +010080 // Output auto initialization if not yet initialized
81 auto_init_if_empty(*_output->info(), TensorShape(_input->info()->dimension(1)), 1, DataType::S32);
82
83 Window win = calculate_max_window(*_output->info(), Steps(1));
morgolockc229e8c2020-09-25 12:03:21 +010084
85 INEKernel::configure(win);
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000086}
Gian Marco Iodiceab182122017-10-09 15:05:40 +010087
Michele Di Giorgioa602f032020-03-12 19:34:33 +000088Status NEGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000089{
morgolockc229e8c2020-09-25 12:03:21 +010090 ARM_COMPUTE_UNUSED(info);
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000091 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row));
Georgios Pinitas631c41a2017-12-06 11:53:03 +000092 return Status{};
Gian Marco Iodiceab182122017-10-09 15:05:40 +010093}
94
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +010095template <typename T>
96void NEGEMMLowpMatrixAReductionKernel::run_internal(const arm_compute::Window &window)
Gian Marco Iodiceab182122017-10-09 15:05:40 +010097{
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +010098 // Intermediate and final accumulator types
99 using TIAcc = wrapper::traits::promote_t<T>;
100 using TAcc = wrapper::traits::promote_t<TIAcc>;
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100101
102 Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
103
104 Window win_input(collapsed_window);
105 win_input.set(Window::DimX, Window::Dimension(0, 0, 0));
106 win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
107 win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));
108
109 Iterator in(_input, win_input);
110 Iterator out(_output, collapsed_window);
111
morgolockc229e8c2020-09-25 12:03:21 +0100112 execute_window_loop(collapsed_window, [&](const Coordinates & id)
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100113 {
morgolockc229e8c2020-09-25 12:03:21 +0100114 auto vsum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
115 TAcc sum_row = 0;
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100116
morgolockc229e8c2020-09-25 12:03:21 +0100117 const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100118
119#if __arm__
morgolockc229e8c2020-09-25 12:03:21 +0100120 asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100121#endif /* __arm__ */
122
morgolockc229e8c2020-09-25 12:03:21 +0100123 int i = 0;
124 // This for loop performs 16 accumulations
125 for(; i <= (_k - 16); i += 16)
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100126 {
morgolockc229e8c2020-09-25 12:03:21 +0100127 const auto a0_d8 = wrapper::vloadq(matrix_a + i);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100128
morgolockc229e8c2020-09-25 12:03:21 +0100129 // Partial accumulations in U16
130 const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100131
morgolockc229e8c2020-09-25 12:03:21 +0100132 // Accumulate to U32
133 vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0));
134 }
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100135
morgolockc229e8c2020-09-25 12:03:21 +0100136 // This for loop performs the leftover accumulations
137 for(; i < _k; ++i)
138 {
139 sum_row += static_cast<TAcc>(matrix_a[i]);
140 }
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100141
142#if defined(__aarch64__)
morgolockc229e8c2020-09-25 12:03:21 +0100143 // Reduction operation available on 64 bit architectures only
144 sum_row += wrapper::vaddv(vsum_row);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100145#else // __aarch64__
morgolockc229e8c2020-09-25 12:03:21 +0100146 auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row));
147 tmp = wrapper::vpadd(tmp, tmp);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100148
morgolockc229e8c2020-09-25 12:03:21 +0100149 sum_row += wrapper::vgetlane(tmp, 0);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100150#endif // __aarch64__
151
morgolockc229e8c2020-09-25 12:03:21 +0100152 // Multiply by scalar if necessary
153 if(_mul_by_scalar)
154 {
155 sum_row *= _scalar;
156 }
Michele Di Giorgioa602f032020-03-12 19:34:33 +0000157
morgolockc229e8c2020-09-25 12:03:21 +0100158 *(reinterpret_cast<int *>(out.ptr())) = static_cast<int32_t>(sum_row);
159 },
160 in, out);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100161}
162
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100163void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info)
164{
165 ARM_COMPUTE_UNUSED(info);
166 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
167 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
168
169 switch(_input->info()->data_type())
170 {
171 case DataType::QASYMM8:
172 run_internal<uint8_t>(window);
173 break;
174 case DataType::QASYMM8_SIGNED:
Michele Di Giorgio47a89902020-03-09 19:32:33 +0000175 case DataType::QSYMM8:
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100176 case DataType::QSYMM8_PER_CHANNEL:
177 run_internal<int8_t>(window);
178 break;
179 default:
180 ARM_COMPUTE_ERROR("Unsupported data type");
181 }
182}
183
Michele Di Giorgioa602f032020-03-12 19:34:33 +0000184void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100185{
Georgios Pinitasa3b1b462017-11-16 19:24:39 +0000186 ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col);
morgolockc229e8c2020-09-25 12:03:21 +0100187 ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported");
188
Georgios Pinitasa3b1b462017-11-16 19:24:39 +0000189 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info()));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100190
Michele Di Giorgioa602f032020-03-12 19:34:33 +0000191 _input = mtx_b;
192 _output = vector_sum_col;
193 _k = info.k;
Michele Di Giorgioa602f032020-03-12 19:34:33 +0000194 _scalar = info.scalar;
195 _mul_by_scalar = info.mul_by_scalar;
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100196
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100197 // Configure kernel window
morgolockc229e8c2020-09-25 12:03:21 +0100198 constexpr unsigned int num_elems_processed_per_iteration = 16;
199
200 // Output auto initialization if not yet initialized
201 auto_init_if_empty(*_output->info(), TensorShape(_input->info()->dimension(0)), 1, DataType::S32);
202
203 // Configure kernel window
204 Window win = calculate_max_window_horizontal(*_output->info(), Steps(num_elems_processed_per_iteration));
morgolockc229e8c2020-09-25 12:03:21 +0100205 INEKernel::configure(win);
Georgios Pinitasa3b1b462017-11-16 19:24:39 +0000206}
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100207
Michele Di Giorgioa602f032020-03-12 19:34:33 +0000208Status NEGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
Georgios Pinitasa3b1b462017-11-16 19:24:39 +0000209{
Michele Di Giorgioa602f032020-03-12 19:34:33 +0000210 ARM_COMPUTE_UNUSED(info);
Georgios Pinitasa3b1b462017-11-16 19:24:39 +0000211 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100212
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000213 return Status{};
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100214}
215
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100216template <typename T>
217void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const ThreadInfo &info)
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100218{
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100219 // Intermediate and final accumulator types
220 using TIAcc = wrapper::traits::promote_t<T>;
221 using TAcc = wrapper::traits::promote_t<TIAcc>;
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100222
morgolockc229e8c2020-09-25 12:03:21 +0100223 Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
224 const auto vec_scalar = wrapper::vdup_n(static_cast<TAcc>(_scalar), wrapper::traits::vector_128_tag{});
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100225
morgolockc229e8c2020-09-25 12:03:21 +0100226 const auto width_matrix_b = static_cast<int>(_input->info()->dimension(0));
227 const auto in_b_stride = static_cast<int>(_input->info()->strides_in_bytes()[1]);
Michele Di Giorgioa602f032020-03-12 19:34:33 +0000228
morgolockc229e8c2020-09-25 12:03:21 +0100229 // The implementation computes 16 elements per iteration
230 const int window_start_x = 16 * info.thread_id;
231 const int window_step_x = 16 * info.num_threads;
232 // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
233 const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
234
235 Window win_out(collapsed_window);
236 win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
237
238 Window win_in(win_out);
239 win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
240 win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
241
242 Iterator inb(_input, win_in);
243 Iterator out(_output, win_out);
244
245 execute_window_loop(win_out, [&](const Coordinates & id)
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100246 {
morgolockc229e8c2020-09-25 12:03:21 +0100247 if(id.x() > width_matrix_b)
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100248 {
morgolockc229e8c2020-09-25 12:03:21 +0100249 return;
250 }
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100251
morgolockc229e8c2020-09-25 12:03:21 +0100252 // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
253 typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] =
254 {
255 wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
256 wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
257 wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
258 wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})
259 };
260
261 const auto *matrix_b = reinterpret_cast<const T *>(inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2]);
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100262
263#if __arm__
morgolockc229e8c2020-09-25 12:03:21 +0100264 asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
265 asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100266#endif /* __arm__ */
267
morgolockc229e8c2020-09-25 12:03:21 +0100268 int i = 0;
269 // This for loop performs 4 accumulations
270 for(; i <= (_k - 4); i += 4)
271 {
272 const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
273 const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride);
274 const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride);
275 const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride);
276
277#if __arm__
278 asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride));
279 asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride));
280 asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride));
281 asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride));
282#endif /* __arm__ */
283
284 // Partial accumulation in 16bit
285 typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type tmp_sum[2] =
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100286 {
morgolockc229e8c2020-09-25 12:03:21 +0100287 wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{}),
288 wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{})
289 };
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100290
morgolockc229e8c2020-09-25 12:03:21 +0100291 tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8));
292 tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8));
293 tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8));
294 tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8));
295 tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8));
296 tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8));
297 tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8));
298 tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100299
morgolockc229e8c2020-09-25 12:03:21 +0100300 // Accumulate to 32bit
301 sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0]));
302 sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0]));
303 sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1]));
304 sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1]));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100305
morgolockc229e8c2020-09-25 12:03:21 +0100306 matrix_b += 4 * in_b_stride;
307 }
308
309 // This for loop perfoms the leftover accumulations
310 for(; i < _k; ++i)
311 {
312 const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
313
314 // Convert S8 to S16
315 const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2]
Michele Di Giorgioa602f032020-03-12 19:34:33 +0000316 {
morgolockc229e8c2020-09-25 12:03:21 +0100317 wrapper::vmovl(wrapper::vgetlow(b0_b8)),
318 wrapper::vmovl(wrapper::vgethigh(b0_b8))
319 };
Michele Di Giorgioa602f032020-03-12 19:34:33 +0000320
morgolockc229e8c2020-09-25 12:03:21 +0100321 // Accumulate to 32bit
322 sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0]));
323 sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0]));
324 sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1]));
325 sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100326
morgolockc229e8c2020-09-25 12:03:21 +0100327 matrix_b += in_b_stride;
328 }
329
330 // Multiply by scalar if necessary
331 if(_mul_by_scalar)
332 {
333 sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar);
334 sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar);
335 sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar);
336 sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar);
337 }
338
339 auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
340 if(id.x() + 16 < width_matrix_b)
341 {
Michele Di Giorgio13ec5f02020-01-02 12:11:13 +0000342 wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0]));
343 wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1]));
344 wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2]));
345 wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3]));
morgolockc229e8c2020-09-25 12:03:21 +0100346 }
347 else
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100348 {
morgolockc229e8c2020-09-25 12:03:21 +0100349 auto left_over = width_matrix_b - id.x();
350 for(auto k = 0; k < 4 && left_over; ++k)
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100351 {
morgolockc229e8c2020-09-25 12:03:21 +0100352 for(auto j = 0; j < 4 && left_over; ++j, --left_over)
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100353 {
morgolockc229e8c2020-09-25 12:03:21 +0100354 *(vector_sum_col + k * 4 + j) = sum_col[k][j];
355 }
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100356 }
morgolockc229e8c2020-09-25 12:03:21 +0100357 }
358 },
359 inb, out);
Pablo Tello6ff12a02017-11-02 16:09:35 +0000360}
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100361
362void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info)
363{
364 ARM_COMPUTE_UNUSED(info);
365 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
366 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
367
368 switch(_input->info()->data_type())
369 {
370 case DataType::QASYMM8:
371 run_internal<uint8_t>(window, info);
372 break;
373 case DataType::QASYMM8_SIGNED:
Michele Di Giorgio47a89902020-03-09 19:32:33 +0000374 case DataType::QSYMM8:
Georgios Pinitasdbdea0d2019-10-16 19:21:40 +0100375 case DataType::QSYMM8_PER_CHANNEL:
376 run_internal<int8_t>(window, info);
377 break;
378 default:
379 ARM_COMPUTE_ERROR("Unsupported data type");
380 }
381}
Michele Di Giorgioa602f032020-03-12 19:34:33 +0000382} // namespace arm_compute