blob: 94e86c6077487fcd9bbb5c09a5021dcdf2d18026 [file] [log] [blame]
Manuel Bottinicfac51c2021-06-18 15:47:28 +01001/*
Radu Salavatf1f1f872024-02-27 18:32:26 +00002 * Copyright (c) 2021-2024 Arm Limited.
Manuel Bottinicfac51c2021-06-18 15:47:28 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Georgios Pinitas7891a732021-08-20 21:39:25 +010024#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"
Manuel Bottinicfac51c2021-06-18 15:47:28 +010025
26#include "arm_compute/core/Error.h"
27#include "arm_compute/core/Helpers.h"
28#include "arm_compute/core/ITensor.h"
29#include "arm_compute/core/KernelDescriptors.h"
30#include "arm_compute/core/Types.h"
Manuel Bottinicfac51c2021-06-18 15:47:28 +010031#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010032#include "arm_compute/core/Validate.h"
Manuel Bottinicfac51c2021-06-18 15:47:28 +010033#include "arm_compute/runtime/NEON/NEScheduler.h"
34#include "arm_compute/runtime/TensorAllocator.h"
Manuel Bottinicfac51c2021-06-18 15:47:28 +010035
ramelg013ae3d882021-09-12 23:07:47 +010036#include "src/common/utils/Log.h"
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010037#include "src/core/helpers/AutoConfiguration.h"
38#include "src/core/helpers/MemoryHelpers.h"
Georgios Pinitas7891a732021-08-20 21:39:25 +010039#include "src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h"
40#include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
41#include "src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h"
42#include "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h"
43#include "src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h"
44#include "src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h"
45#include "src/cpu/kernels/CpuGemmTranspose1xWKernel.h"
46#include "src/cpu/operators/CpuActivation.h"
47#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
48#include "src/cpu/utils/CpuAuxTensorHandler.h"
Manuel Bottinicfac51c2021-06-18 15:47:28 +010049
50using namespace arm_compute::misc::shape_calculator;
51using namespace arm_compute::experimental;
52
53namespace arm_compute
54{
55namespace cpu
56{
57namespace
58{
59cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
60{
61 cpu::AsmGemmInfo asm_info;
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010062 asm_info.method = cpu::AsmConvMethod::Im2Col;
63 asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
64 asm_info.depth_output_gemm3d = info.depth_output_gemm3d();
65 asm_info.activation_info = info.activation_info();
66 asm_info.output_stage = info.gemmlowp_output_stage();
67 asm_info.fast_mode = info.fast_math();
Radu Salavatf1f1f872024-02-27 18:32:26 +000068 asm_info.accumulate = info.accumulate();
Manuel Bottinicfac51c2021-06-18 15:47:28 +010069
70 return asm_info;
71}
72} // namespace
73
// Default constructor.
// Creates the assembly dispatch object up front and value-initializes every
// kernel pointer, auxiliary TensorInfo and state flag; the real setup of
// kernels and workspace sizes happens later in configure().
CpuGemmLowpMatrixMultiplyCore::CpuGemmLowpMatrixMultiplyCore()
    : _asm_glue(std::make_unique<CpuGemmAssemblyDispatch>()),
      _mm_kernel(),
      _mtx_a_reshape_kernel(),
      _mtx_b_reshape_kernel(),
      _mtx_a_reduction_kernel(),
      _mtx_b_reduction_kernel(),
      _offset_contribution_kernel(),
      _offset_contribution_output_stage_kernel(),
      _activation_func(),
      _convert_to_signed_asymm(),
      _convert_from_signed_asymm(),
      _vector_sum_col(),
      _vector_sum_row(),
      _tmp_a(),
      _tmp_b(),
      _mm_result_s32(),
      _signed_a(),
      _signed_output(),
      _a_offset(0),
      _b_offset(0),
      _run_vector_matrix_multiplication(false),
      _assembly_path(false),
      _fused_assembly_path(false),
      _reshape_b_only_on_first_run(false),
      _is_prepared(false),
      _fuse_output_stage(false),
      _run_activation(false),
      _flip_signedness(false),
      _gemm_info(),
      _aux_mem(Count) // one MemoryInfo slot per AuxTensorIdx entry
{
}
// Out-of-line defaulted destructor: required here because the unique_ptr members
// hold types that are incomplete in the header.
CpuGemmLowpMatrixMultiplyCore::~CpuGemmLowpMatrixMultiplyCore() = default;
108
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100109void CpuGemmLowpMatrixMultiplyCore::configure(
110 const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100111{
112 ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, dst);
113 ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpMatrixMultiplyCore::validate(a, b, c, dst, gemm_info));
ramelg013ae3d882021-09-12 23:07:47 +0100114 ARM_COMPUTE_LOG_PARAMS(a, b, c, dst, gemm_info);
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100115
116 const ITensorInfo *matrix_a = a;
117 const ITensorInfo *matrix_b = b;
118 GEMMInfo info = gemm_info;
119
120 // Set internal variables
121 _a_offset = a->quantization_info().uniform().offset;
122 _b_offset = b->quantization_info().uniform().offset;
123 _run_vector_matrix_multiplication = a->dimension(1) < 2;
Viet-Hoa Do9b0a6b42023-04-03 16:27:25 +0100124 _reshape_b_only_on_first_run = b->are_values_constant();
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100125 _is_prepared = false;
126 _fused_assembly_path = false;
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100127 _flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) &&
128 _reshape_b_only_on_first_run;
129 _gemm_info = gemm_info;
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100130
131 _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
132
133 const ITensorInfo *a_to_use = a;
134
135 // Convert to QASYMM8 -> QASYMM8_SIGNED and back
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100136 if (_flip_signedness)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100137 {
138 const int32_t offset_correction = 128;
139 const DataType dt = DataType::QASYMM8_SIGNED;
140 const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform();
141
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100142 _signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(
143 QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100144 _convert_to_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
145 _convert_to_signed_asymm->configure(a_to_use, &_signed_a);
146 a_to_use = &_signed_a;
147 _a_offset = _signed_a.quantization_info().uniform().offset;
148
149 const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100150 _signed_output = dst->clone()->set_data_type(dt).set_quantization_info(
151 QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100152
153 // Output stage correction
154 GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
155 output_stage_corr.gemmlowp_offset = _signed_output.quantization_info().uniform().offset;
156 output_stage_corr.gemmlowp_min_bound -= offset_correction;
157 output_stage_corr.gemmlowp_max_bound -= offset_correction;
158 info.set_gemmlowp_output_stage(output_stage_corr);
159
160 // Update matrix a
161 matrix_a = &_signed_a;
162 }
163
164 // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100165 if (info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100166 {
167 _fuse_output_stage = true;
168 _mm_result_s32 = TensorInfo(dst->tensor_shape(), 1, DataType::S32);
169 }
170
171 // Initialize assembly kernel meta-data
172 const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
173#ifdef __aarch64__
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100174 if (!(!b->are_values_constant() &&
175 b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently.
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100176 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100177 switch (a->data_type())
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100178 {
Viet-Hoa Do9b0a6b42023-04-03 16:27:25 +0100179 case DataType::QASYMM8:
180 case DataType::QASYMM8_SIGNED:
181 case DataType::U8:
182 case DataType::S8:
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100183 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100184 if (is_data_type_quantized_asymmetric(a_to_use->data_type()) &&
185 info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
Viet-Hoa Do9b0a6b42023-04-03 16:27:25 +0100186 {
187 auto c_info_to_use = c == nullptr ? nullptr : c;
188 _asm_glue->configure(a_to_use, b, c_info_to_use, dst, asm_info);
189 _fused_assembly_path = _asm_glue->is_configured();
190 }
191 else
192 {
193 auto output_to_use = (_fuse_output_stage ? &_mm_result_s32 : dst);
194 _asm_glue->configure(a_to_use, b, nullptr, output_to_use, asm_info);
195 }
196 _assembly_path = _asm_glue->is_configured();
197 break;
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100198 }
Viet-Hoa Do9b0a6b42023-04-03 16:27:25 +0100199 default:
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100200 {
Viet-Hoa Do9b0a6b42023-04-03 16:27:25 +0100201 ARM_COMPUTE_ERROR("Datatype not supported");
202 break;
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100203 }
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100204 }
205 }
206#endif /* __aarch64__ */
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100207 if (!(_assembly_path || _run_vector_matrix_multiplication))
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100208 {
209 matrix_a = &_tmp_a;
210 matrix_b = &_tmp_b;
211
212 // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100213 _tmp_a =
214 TensorInfo(compute_interleaved_shape(*a_to_use), 1, a_to_use->data_type(), a_to_use->quantization_info());
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100215 // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
216 _tmp_b = TensorInfo(compute_transpose1xW_shape(*b), 1, b->data_type(), b->quantization_info());
217
218 // Configure interleave kernel
219 _mtx_a_reshape_kernel = std::make_unique<kernels::CpuGemmInterleave4x4Kernel>();
220 _mtx_a_reshape_kernel->configure(a_to_use, &_tmp_a);
221
222 // Configure transpose kernel
223 _mtx_b_reshape_kernel = std::make_unique<kernels::CpuGemmTranspose1xWKernel>();
224 _mtx_b_reshape_kernel->configure(b, &_tmp_b);
225 }
226
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100227 if (!_fused_assembly_path)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100228 {
229 // Build reduction info
230 const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);
231
232 // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100233 if (_a_offset != 0)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100234 {
235 _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
236
237 // Configure Matrix B reduction kernel
238 _mtx_b_reduction_kernel = std::make_unique<kernels::CpuGemmLowpMatrixBReductionKernel>();
239 _mtx_b_reduction_kernel->configure(b, &_vector_sum_col, reduction_info);
240 }
241
242 // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100243 if (_b_offset != 0)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100244 {
245 _vector_sum_row = TensorInfo(compute_reductionB_shape(*a_to_use), 1, DataType::S32);
246
247 // Configure matrix A reduction kernel
248 _mtx_a_reduction_kernel = std::make_unique<kernels::CpuGemmLowpMatrixAReductionKernel>();
249 _mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info);
250 }
251
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100252 if (_fuse_output_stage)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100253 {
254 // Configure matrix multiply kernel
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100255 if (!_assembly_path)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100256 {
257 _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();
258 _mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32);
259 }
260
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100261 _offset_contribution_output_stage_kernel =
262 std::make_unique<kernels::CpuGemmLowpOffsetContributionOutputStageKernel>();
263 _offset_contribution_output_stage_kernel->configure(
264 &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col,
265 _b_offset == 0 ? nullptr : &_vector_sum_row, c, _flip_signedness ? &_signed_output : dst,
266 a->dimension(0), _a_offset, _b_offset, info.gemmlowp_output_stage());
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100267
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100268 if (_flip_signedness)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100269 {
270 _convert_from_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
271 _convert_from_signed_asymm->configure(&_signed_output, dst);
272 }
273 }
274 else
275 {
276 // Configure matrix multiply kernel
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100277 if (!_assembly_path)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100278 {
279 _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();
280 _mm_kernel->configure(matrix_a, matrix_b, dst);
281 }
282 // Configure offset contribution kernel
283 _offset_contribution_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionKernel>();
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100284 _offset_contribution_kernel->configure(dst, _a_offset == 0 ? nullptr : &_vector_sum_col,
285 _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->dimension(0),
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100286 _a_offset, _b_offset);
287 }
288 }
289 // Configure activation
290 const ActivationLayerInfo &activation = gemm_info.activation_info();
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100291 _run_activation =
292 activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation));
293 if (_run_activation)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100294 {
295 _activation_func = std::make_unique<CpuActivation>();
296 _activation_func->configure(dst, nullptr, activation);
297 }
298
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100299 if (_assembly_path)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100300 {
SiCong Lic5ab4df2023-10-17 17:38:57 +0100301 const auto asm_mem_req = _asm_glue->workspace();
302 for (unsigned int slot = 0; slot < asm_mem_req.size(); ++slot)
303 {
304 _aux_mem[slot] = asm_mem_req[slot];
305 }
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100306 }
307
308 // Request memory for LHS and RHS reshape matrix
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100309 _aux_mem[VectorSumCol] =
310 MemoryInfo(offset_int_vec(VectorSumCol),
311 !_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run ? MemoryLifetime::Persistent
312 : MemoryLifetime::Temporary,
313 _vector_sum_col.total_size());
314 _aux_mem[VectorSumRow] =
315 MemoryInfo(offset_int_vec(VectorSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());
316 _aux_mem[TmpA] = MemoryInfo(offset_int_vec(TmpA), MemoryLifetime::Temporary, _tmp_a.total_size());
317 _aux_mem[TmpB] = MemoryInfo(offset_int_vec(TmpB),
318 _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary,
319 _tmp_b.total_size());
320 _aux_mem[MMResultS32] =
321 MemoryInfo(offset_int_vec(MMResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());
322 _aux_mem[SignedA] = MemoryInfo(offset_int_vec(SignedA), MemoryLifetime::Temporary, _signed_a.total_size());
323 _aux_mem[SignedOutput] =
324 MemoryInfo(offset_int_vec(SignedOutput), MemoryLifetime::Temporary, _signed_output.total_size());
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100325}
326
// Static validation mirroring configure(): checks data types, shapes and the
// selected execution strategy without allocating anything, and returns an error
// Status on the first violated constraint. Must stay in lock-step with
// configure() — the branch structure below deliberately replays its decisions.
Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a,
                                               const ITensorInfo *b,
                                               const ITensorInfo *c,
                                               const ITensorInfo *output,
                                               const GEMMInfo    &gemm_info)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
                                                         DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8,
                                                         DataType::QASYMM8_SIGNED);
    // A bias requires an output stage to be folded into; raw S32 output cannot take one.
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr &&
                                        gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE,
                                    "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
        (a)->dimension(0) != (b)->dimension(1),
        "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");

    // When using accumulation(in place summation), for now, the only supported DataType for output is S32.
    if (gemm_info.accumulate())
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE,
                                        "Accumulation is not supported for output QASYMM8/QASYMM8_SIGNED");
    }

    GEMMInfo           info          = gemm_info; // local copy: output stage may be rewritten below
    const ITensorInfo *matrix_a_info = a;
    const ITensorInfo *matrix_b_info = b;

    const ITensorInfo *a_to_use = a;

    TensorInfo tmp_a_info{};
    TensorInfo tmp_b_info{};
    TensorInfo mm_result_s32_info{};

    int32_t a_offset = a->quantization_info().uniform().offset;
    int32_t b_offset = b->quantization_info().uniform().offset;

    // When an output stage is requested the multiply produces an S32 intermediate.
    bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
    if (fuse_output_stage)
    {
        auto_init_if_empty(mm_result_s32_info,
                           a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
    }

    // Convert QASYMM8->QASYMM8_SIGNED
    // NOTE(review): configure() gates the equivalent flag on b->are_values_constant()
    // rather than info.reshape_b_only_on_first_run() — confirm the two predicates
    // cannot diverge for any caller.
    TensorInfo signed_a{};
    TensorInfo signed_output{};
    bool       flip_signedness = is_data_type_quantized_per_channel(b->data_type()) &&
                           (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();
    if (flip_signedness)
    {
        const int32_t                 offset_correction = 128;
        const DataType                dt                = DataType::QASYMM8_SIGNED;
        const UniformQuantizationInfo iqinfo            = a_to_use->quantization_info().uniform();

        // Shift the unsigned input into the signed domain (offset moves by +128).
        signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(
            QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
        ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a));
        a_to_use = &signed_a;
        a_offset = signed_a.quantization_info().uniform().offset;

        const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
        signed_output                        = output->clone()->set_data_type(dt).set_quantization_info(
            QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));

        // Output stage correction: offset and clamp bounds follow the signed intermediate.
        GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
        output_stage_corr.gemmlowp_offset         = signed_output.quantization_info().uniform().offset;
        output_stage_corr.gemmlowp_min_bound -= offset_correction;
        output_stage_corr.gemmlowp_max_bound -= offset_correction;
        info.set_gemmlowp_output_stage(output_stage_corr);

        // Update matrix a
        matrix_a_info = &signed_a;
    }

    // Initialize assembly kernel meta-data
    const AsmGemmInfo asm_info = init_assembly_metadata(info);

    // Check if we need to run the optimized assembly kernel
    bool run_optimised             = false;
    bool run_optimised_requantized = false;

    if (!(!b->are_values_constant() &&
          b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently.
    {
        if (is_data_type_quantized_asymmetric(a_to_use->data_type()) &&
            info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
        {
            // Fused assembly path: requantization handled inside the assembly kernel.
            run_optimised             = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, c, output, asm_info));
            run_optimised_requantized = run_optimised;
        }
        else
        {
            // Unfused assembly path: assembly writes S32 (or final output when no stage).
            run_optimised = bool(CpuGemmAssemblyDispatch::validate(
                a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info));
        }
    }

    if (run_optimised)
    {
        ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
        // Cross-check the 2D<->3D reinterpretations against the output shape.
        if (info.depth_output_gemm3d() != 0)
        {
            if (info.reinterpret_input_as_3d())
            {
                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
            }
            else
            {
                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
            }
        }
        else
        {
            ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
        }
    }
    else
    {
        // Reference path: 3D reinterpretation is not supported here at all.
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(),
                                        "NEGEMM cannot reinterpret the input tensor as 3D");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0,
                                        "NEGEMM cannot reinterpret the output tensor as 3D");

        const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
        if (!run_vector_matrix_multiplication)
        {
            matrix_a_info = &tmp_a_info;
            matrix_b_info = &tmp_b_info;

            // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
            TensorShape shape_tmp_a = a->tensor_shape();
            shape_tmp_a.set(0, a->dimension(0) * 4);
            shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));

            // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
            TensorShape shape_tmp_b = b->tensor_shape();
            shape_tmp_b.set(0, b->dimension(1) * 16);
            shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));

            // Validate interleave kernel
            auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
            auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));

            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info));
        }
    }

    // Offset-compensation kernels are only needed when requantization is NOT fused
    // into the assembly kernel.
    if (!run_optimised_requantized)
    {
        TensorInfo info_vector_sum_col{};
        TensorInfo info_vector_sum_row{};

        const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);

        // Validate matrix B reduction kernel only if _a_offset is not equal to 0
        if (a_offset != 0)
        {
            info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);

            // Configure Matrix B reduction kernel
            ARM_COMPUTE_RETURN_ON_ERROR(
                kernels::CpuGemmLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info));
        }

        // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
        if (b_offset != 0)
        {
            info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);

            // Configure matrix A reduction kernel
            ARM_COMPUTE_RETURN_ON_ERROR(
                kernels::CpuGemmLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info));
        }

        if (fuse_output_stage)
        {
            if (!run_optimised)
            {
                ARM_COMPUTE_RETURN_ERROR_ON_MSG(
                    info.reinterpret_input_as_3d(),
                    "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
                ARM_COMPUTE_RETURN_ERROR_ON_MSG(
                    info.depth_output_gemm3d() != 0,
                    "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");

                ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(
                    matrix_a_info, matrix_b_info, &mm_result_s32_info));
            }

            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionOutputStageKernel::validate(
                &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col,
                b_offset == 0 ? nullptr : &info_vector_sum_row, c, flip_signedness ? &signed_output : output, a_offset,
                b_offset, info.gemmlowp_output_stage()));
        }
        else
        {
            if (!run_optimised)
            {
                ARM_COMPUTE_RETURN_ERROR_ON_MSG(
                    info.reinterpret_input_as_3d(),
                    "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
                ARM_COMPUTE_RETURN_ERROR_ON_MSG(
                    info.depth_output_gemm3d() != 0,
                    "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");

                ARM_COMPUTE_RETURN_ON_ERROR(
                    kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
            }
            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate(
                output, a_offset == 0 ? nullptr : &info_vector_sum_col, b_offset == 0 ? nullptr : &info_vector_sum_row,
                a_offset, b_offset));
        }
    }

    // Validate activation
    const ActivationLayerInfo &activation = gemm_info.activation_info();
    if (activation.enabled())
    {
        ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, activation));
    }

    return Status{};
}
559
560void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
561{
562 prepare(tensors);
Georgios Pinitas22f5ed52021-07-23 18:58:43 +0100563
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100564 auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0);
565 auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
566 auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2);
567 auto dst = tensors.get_tensor(TensorType::ACL_DST);
568 auto a_to_use = a;
569 auto matrix_a = a;
570 auto matrix_b = b;
571
572 CpuAuxTensorHandler vector_sum_col(offset_int_vec(VectorSumCol), _vector_sum_col, tensors, false);
573 CpuAuxTensorHandler vector_sum_row(offset_int_vec(VectorSumRow), _vector_sum_row, tensors, false);
574 CpuAuxTensorHandler tmp_a(offset_int_vec(TmpA), _tmp_a, tensors, false);
575 CpuAuxTensorHandler tmp_b(offset_int_vec(TmpB), _tmp_b, tensors, true);
576 CpuAuxTensorHandler mm_result_s32(offset_int_vec(MMResultS32), _mm_result_s32, tensors, false);
577 CpuAuxTensorHandler signed_a(offset_int_vec(SignedA), _signed_a, tensors, false);
578 CpuAuxTensorHandler signed_output(offset_int_vec(SignedOutput), _signed_output, tensors, false);
579
580 // Convert QASYMM8->QASYMM8_SIGNED
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100581 if (_flip_signedness)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100582 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100583 ITensorPack pack = {{TensorType::ACL_SRC, a}, {TensorType::ACL_DST, signed_a.get()}};
584 NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(),
585 pack);
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100586 a_to_use = signed_a.get();
Georgios Pinitasd4a5bc52021-08-12 07:42:51 +0100587 matrix_a = signed_a.get();
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100588 }
589
590 // Run GEMM
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100591 if (_asm_glue->is_configured())
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100592 {
593 ITensorPack asm_glue_tensors = tensors;
594 auto output_to_use = (_fuse_output_stage ? mm_result_s32.get() : dst);
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100595 if (is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) &&
596 _gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100597 {
598 asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use);
599 asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b);
600 asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_2, c);
601 asm_glue_tensors.add_tensor(TensorType::ACL_DST, dst);
602 }
603 else
604 {
605 asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use);
606 asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b);
607 asm_glue_tensors.add_tensor(TensorType::ACL_DST, output_to_use);
608 }
609 _asm_glue->run(asm_glue_tensors);
610 }
611 else
612 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100613 if (!_run_vector_matrix_multiplication)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100614 {
615 matrix_a = tmp_a.get();
616 matrix_b = tmp_b.get();
617 // Run interleave kernel
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100618 ITensorPack pack_a = {{TensorType::ACL_SRC, a_to_use}, {TensorType::ACL_DST, tmp_a.get()}};
619 NEScheduler::get().schedule_op(_mtx_a_reshape_kernel.get(), Window::DimY, _mtx_a_reshape_kernel->window(),
620 pack_a);
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100621
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100622 if (!_reshape_b_only_on_first_run)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100623 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100624 ITensorPack pack_b = {{TensorType::ACL_SRC, b}, {TensorType::ACL_DST, tmp_b.get()}};
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100625 // Run transpose kernel
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100626 NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY,
627 _mtx_b_reshape_kernel->window(), pack_b);
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100628 }
629 }
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100630 ITensorPack pack_mm = {{TensorType::ACL_SRC_0, matrix_a}, {TensorType::ACL_SRC_1, matrix_b}};
631 if (_fuse_output_stage)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100632 {
633 pack_mm.add_tensor(TensorType::ACL_DST, mm_result_s32.get());
634 }
635 else
636 {
637 pack_mm.add_tensor(TensorType::ACL_DST, dst);
638 }
639 NEScheduler::get().schedule_op(_mm_kernel.get(), Window::DimY, _mm_kernel->window(), pack_mm);
640 }
641
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100642 if (!_fused_assembly_path)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100643 {
644 // Run matrix A reduction kernel only if _b_offset is not equal to 0
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100645 if (_b_offset != 0)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100646 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100647 ITensorPack pack = {{TensorType::ACL_SRC, a_to_use}, {TensorType::ACL_DST, vector_sum_row.get()}};
648 NEScheduler::get().schedule_op(_mtx_a_reduction_kernel.get(), Window::DimX,
649 _mtx_a_reduction_kernel->window(), pack);
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100650 }
651
652 // Run matrix B reduction kernel only if _a_offset is not equal to 0
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100653 if (_a_offset != 0 && !_reshape_b_only_on_first_run)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100654 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100655 ITensorPack pack = {{TensorType::ACL_SRC, b}, {TensorType::ACL_DST, vector_sum_col.get()}};
656 NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX,
657 _mtx_b_reduction_kernel->window(), pack);
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100658 }
659
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100660 if (_fuse_output_stage)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100661 {
662 ITensorPack pack;
663 pack.add_tensor(TensorType::ACL_SRC_0, mm_result_s32.get());
664 pack.add_tensor(TensorType::ACL_SRC_1, _a_offset == 0 ? nullptr : vector_sum_col.get());
665 pack.add_tensor(TensorType::ACL_SRC_2, _b_offset == 0 ? nullptr : vector_sum_row.get());
666 pack.add_tensor(TensorType::ACL_SRC_3, c);
667 pack.add_tensor(TensorType::ACL_DST, _flip_signedness ? signed_output.get() : dst);
668
669 // Run offset contribution kernel
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100670 NEScheduler::get().schedule_op(_offset_contribution_output_stage_kernel.get(), Window::DimY,
671 _offset_contribution_output_stage_kernel->window(), pack);
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100672 }
673 else
674 {
675 ITensorPack pack;
676 pack.add_tensor(TensorType::ACL_SRC_0, _a_offset == 0 ? nullptr : vector_sum_col.get());
677 pack.add_tensor(TensorType::ACL_SRC_1, _b_offset == 0 ? nullptr : vector_sum_row.get());
678 pack.add_tensor(TensorType::ACL_DST, dst);
679
680 // Run offset contribution kernel
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100681 NEScheduler::get().schedule_op(_offset_contribution_kernel.get(), Window::DimY,
682 _offset_contribution_kernel->window(), pack);
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100683 }
684 }
685
686 // Convert QASYMM8_SIGNED->QASYMM8
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100687 if (!_fused_assembly_path && _fuse_output_stage && _flip_signedness)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100688 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100689 ITensorPack pack = {{TensorType::ACL_SRC, signed_output.get()}, {TensorType::ACL_DST, dst}};
690 NEScheduler::get().schedule_op(_convert_from_signed_asymm.get(), Window::DimY,
691 _convert_from_signed_asymm->window(), pack);
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100692 }
693
694 // Run fused activation unless already run in the fused assembly
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100695 if (_run_activation)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100696 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100697 ITensorPack pack = {{TensorType::ACL_SRC, dst}, {TensorType::ACL_DST, dst}};
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100698 _activation_func->run(pack);
699 }
700}
701
702void CpuGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)
703{
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100704 if (!_is_prepared)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100705 {
706 auto original_b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
707 // Run assembly reshape
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100708 if (_asm_glue->is_configured())
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100709 {
710 _asm_glue->prepare(tensors);
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100711 }
712 // Run non-assembly reshape
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100713 else if (_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured())
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100714 {
715 // Run reshape kernel and mark original weights tensor as unused
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100716 ITensor *tmp_b_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(TmpB)));
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100717 CpuAuxTensorHandler tmp_b(_tmp_b, *tmp_b_p);
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100718 ITensorPack pack = {{TensorType::ACL_SRC, original_b}, {TensorType::ACL_DST, tmp_b.get()}};
719 NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(),
720 pack);
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100721 }
722
723 // Run matrix B reduction kernel only if _a_offset is not equal to 0
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100724 if (!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run)
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100725 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100726 ITensor *vector_sum_col_p =
727 utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(VectorSumCol)));
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100728 CpuAuxTensorHandler vector_sum_col(_vector_sum_col, *vector_sum_col_p);
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100729 ITensorPack pack = {{TensorType::ACL_SRC, original_b}, {TensorType::ACL_DST, vector_sum_col.get()}};
730 NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX,
731 _mtx_b_reduction_kernel->window(), pack);
Manuel Bottinicfac51c2021-06-18 15:47:28 +0100732 }
733 _is_prepared = true;
734 }
735}
// Expose the auxiliary (workspace) memory requirements computed at configure
// time — e.g. reshaped A/B buffers, reduction vectors and intermediate
// s32 results — so the caller can allocate them and pass them via the
// tensor pack at run()/prepare() time.
experimental::MemoryRequirements CpuGemmLowpMatrixMultiplyCore::workspace() const
{
    return _aux_mem;
}
740} // namespace cpu
741} // namespace arm_compute