/*
 * Copyright (c) 2021-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/TensorAllocator.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/MemoryHelpers.h"

#include "src/common/utils/Log.h"
#include "src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h"
#include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
#include "src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h"
#include "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h"
#include "src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h"
#include "src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h"
#include "src/cpu/kernels/CpuGemmTranspose1xWKernel.h"
#include "src/cpu/operators/CpuActivation.h"
#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
#include "src/cpu/utils/CpuAuxTensorHandler.h"

using namespace arm_compute::misc::shape_calculator;
using namespace arm_compute::experimental;
namespace arm_compute
{
namespace cpu
{
namespace
{
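// Collect the operator-level GEMM configuration into the meta-data structure consumed by
// CpuGemmAssemblyDispatch. Only the fields relevant to the lowp path are forwarded.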
cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
{
    cpu::AsmGemmInfo asm_info;
    asm_info.method                  = cpu::AsmConvMethod::Im2Col;
    asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
    asm_info.depth_output_gemm3d     = info.depth_output_gemm3d();
    asm_info.activation_info         = info.activation_info();
    asm_info.output_stage            = info.gemmlowp_output_stage();
    asm_info.fast_mode               = info.fast_math();

    return asm_info;
}
} // namespace

CpuGemmLowpMatrixMultiplyCore::CpuGemmLowpMatrixMultiplyCore()
    : _asm_glue(std::make_unique<CpuGemmAssemblyDispatch>()),
      _mm_kernel(),
      _mtx_a_reshape_kernel(),
      _mtx_b_reshape_kernel(),
      _mtx_a_reduction_kernel(),
      _mtx_b_reduction_kernel(),
      _offset_contribution_kernel(),
      _offset_contribution_output_stage_kernel(),
      _activation_func(),
      _convert_to_signed_asymm(),
      _convert_from_signed_asymm(),
      _vector_sum_col(),
      _vector_sum_row(),
      _tmp_a(),
      _tmp_b(),
      _mm_result_s32(),
      _signed_a(),
      _signed_output(),
      _a_offset(0),
      _b_offset(0),
      _run_vector_matrix_multiplication(false),
      _assembly_path(false),
      _fused_assembly_path(false),
      _reshape_b_only_on_first_run(false),
      _is_prepared(false),
      _fuse_output_stage(false),
      _run_activation(false),
      _flip_signedness(false),
      _gemm_info(),
      _aux_mem(Count)
{
}
CpuGemmLowpMatrixMultiplyCore::~CpuGemmLowpMatrixMultiplyCore() = default;

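// configure() selects one of three execution paths:
//  1. Fused assembly: the optimized AArch64 kernel also performs the requantization, so no extra
//     offset-contribution / output-stage kernels are scheduled.
//  2. Assembly + separate epilogue: the optimized kernel produces the matrix product (S32 when an
//     output stage is requested) and the offset contribution / output stage is applied by
//     dedicated kernels.
//  3. Reference path: CpuGemmInterleave4x4Kernel / CpuGemmTranspose1xWKernel reshapes followed by
//     CpuGemmLowpMatrixMultiplyKernel, with the same epilogue kernels as 2.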
void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, dst);
    ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpMatrixMultiplyCore::validate(a, b, c, dst, gemm_info));
    ARM_COMPUTE_LOG_PARAMS(a, b, c, dst, gemm_info);

    const ITensorInfo *matrix_a = a;
    const ITensorInfo *matrix_b = b;
    GEMMInfo           info     = gemm_info;

    // Set internal variables
    _a_offset                         = a->quantization_info().uniform().offset;
    _b_offset                         = b->quantization_info().uniform().offset;
    _run_vector_matrix_multiplication = a->dimension(1) < 2;
    _reshape_b_only_on_first_run      = b->are_values_constant();
    _is_prepared                      = false;
    _fused_assembly_path              = false;
    _flip_signedness                  = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run;
    _gemm_info                        = gemm_info;

    _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();

    const ITensorInfo *a_to_use = a;

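    // When B is quantized per-channel (QSYMM8_PER_CHANNEL) and A is QASYMM8, the computation is
    // carried out on signed 8-bit data: A is converted to QASYMM8_SIGNED, its quantization info and
    // the output-stage offset/bounds are shifted by 128 to compensate, and the result is converted
    // back to QASYMM8 after the output stage, so the final result is unaffected by the change.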
    // Convert to QASYMM8 -> QASYMM8_SIGNED and back
    if(_flip_signedness)
    {
        const int32_t                 offset_correction = 128;
        const DataType                dt                = DataType::QASYMM8_SIGNED;
        const UniformQuantizationInfo iqinfo            = a_to_use->quantization_info().uniform();

        _signed_a                = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
        _convert_to_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
        _convert_to_signed_asymm->configure(a_to_use, &_signed_a);
        a_to_use  = &_signed_a;
        _a_offset = _signed_a.quantization_info().uniform().offset;

        const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
        _signed_output                       = dst->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));

        // Output stage correction
        GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
        output_stage_corr.gemmlowp_offset         = _signed_output.quantization_info().uniform().offset;
        output_stage_corr.gemmlowp_min_bound -= offset_correction;
        output_stage_corr.gemmlowp_max_bound -= offset_correction;
        info.set_gemmlowp_output_stage(output_stage_corr);

        // Update matrix a
        matrix_a = &_signed_a;
    }

    // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
    if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
    {
        _fuse_output_stage = true;
        _mm_result_s32     = TensorInfo(dst->tensor_shape(), 1, DataType::S32);
    }

    // Initialize assembly kernel meta-data
    const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
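    // The optimized GEMM (CpuGemmAssemblyDispatch) is only available on AArch64 builds; other
    // architectures fall through to the reference NEON kernels configured below.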
#ifdef __aarch64__
    if(!(!b->are_values_constant() && b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently.
    {
        switch(a->data_type())
        {
            case DataType::QASYMM8:
            case DataType::QASYMM8_SIGNED:
            case DataType::U8:
            case DataType::S8:
            {
                if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
                {
                    auto c_info_to_use = c == nullptr ? nullptr : c;
                    _asm_glue->configure(a_to_use, b, c_info_to_use, dst, asm_info);
                    _fused_assembly_path = _asm_glue->is_configured();
                }
                else
                {
                    auto output_to_use = (_fuse_output_stage ? &_mm_result_s32 : dst);
                    _asm_glue->configure(a_to_use, b, nullptr, output_to_use, asm_info);
                }
                _assembly_path = _asm_glue->is_configured();
                break;
            }
            default:
            {
                ARM_COMPUTE_ERROR("Datatype not supported");
                break;
            }
        }
    }
#endif /* __aarch64__ */
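    // Reference path: the generic matrix-multiply kernel works on reshaped inputs, so A is
    // interleaved in 4x4 blocks and B is transposed in 1x16 blocks. The reshapes are skipped for
    // the vector-by-matrix case, which reads the operands directly.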
    if(!(_assembly_path || _run_vector_matrix_multiplication))
    {
        matrix_a = &_tmp_a;
        matrix_b = &_tmp_b;

        // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
        _tmp_a = TensorInfo(compute_interleaved_shape(*a_to_use), 1, a_to_use->data_type(), a_to_use->quantization_info());
        // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
        _tmp_b = TensorInfo(compute_transpose1xW_shape(*b), 1, b->data_type(), b->quantization_info());

        // Configure interleave kernel
        _mtx_a_reshape_kernel = std::make_unique<kernels::CpuGemmInterleave4x4Kernel>();
        _mtx_a_reshape_kernel->configure(a_to_use, &_tmp_a);

        // Configure transpose kernel
        _mtx_b_reshape_kernel = std::make_unique<kernels::CpuGemmTranspose1xWKernel>();
        _mtx_b_reshape_kernel->configure(b, &_tmp_b);
    }

    if(!_fused_assembly_path)
    {
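        // Offset contribution: with zero points a_offset and b_offset the product expands as
        //   sum_k (A[i,k] - a_offset) * (B[k,j] - b_offset)
        //     = sum_k A[i,k]*B[k,j] - b_offset * sum_k A[i,k] - a_offset * sum_k B[k,j] + K * a_offset * b_offset
        // so the column sums of B are only needed when a_offset != 0 and the row sums of A only
        // when b_offset != 0.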
        // Build reduction info
        const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);

        // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
        if(_a_offset != 0)
        {
            _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);

            // Configure Matrix B reduction kernel
            _mtx_b_reduction_kernel = std::make_unique<kernels::CpuGemmLowpMatrixBReductionKernel>();
            _mtx_b_reduction_kernel->configure(b, &_vector_sum_col, reduction_info);
        }

        // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
        if(_b_offset != 0)
        {
            _vector_sum_row = TensorInfo(compute_reductionB_shape(*a_to_use), 1, DataType::S32);

            // Configure matrix A reduction kernel
            _mtx_a_reduction_kernel = std::make_unique<kernels::CpuGemmLowpMatrixAReductionKernel>();
            _mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info);
        }

        if(_fuse_output_stage)
        {
            // Configure matrix multiply kernel
            if(!_assembly_path)
            {
                _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();
                _mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32);
            }

            _offset_contribution_output_stage_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionOutputStageKernel>();
            _offset_contribution_output_stage_kernel->configure(&_mm_result_s32,
                                                                _a_offset == 0 ? nullptr : &_vector_sum_col,
                                                                _b_offset == 0 ? nullptr : &_vector_sum_row, c,
                                                                _flip_signedness ? &_signed_output : dst,
                                                                a->dimension(0),
                                                                _a_offset, _b_offset, info.gemmlowp_output_stage());

            if(_flip_signedness)
            {
                _convert_from_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
                _convert_from_signed_asymm->configure(&_signed_output, dst);
            }
        }
        else
        {
            // Configure matrix multiply kernel
            if(!_assembly_path)
            {
                _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();
                _mm_kernel->configure(matrix_a, matrix_b, dst);
            }
            // Configure offset contribution kernel
            _offset_contribution_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionKernel>();
            _offset_contribution_kernel->configure(dst, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->dimension(0),
                                                   _a_offset, _b_offset);
        }
    }
    // Configure activation
    const ActivationLayerInfo &activation = gemm_info.activation_info();
    _run_activation                       = activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation));
    if(_run_activation)
    {
        _activation_func = std::make_unique<CpuActivation>();
        _activation_func->configure(dst, nullptr, activation);
    }

    if(_assembly_path)
    {
        auto asm_mem_req           = _asm_glue->workspace();
        _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace];
        _aux_mem[Pretranspose]     = asm_mem_req[Pretranspose];
    }

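    // Tensors that depend only on a constant B (the transposed B and its column sums) are marked
    // Persistent so the results of prepare() can be reused across calls to run(); everything else
    // is Temporary workspace.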
    // Request memory for LHS and RHS reshape matrix
    _aux_mem[VectorSumCol] = MemoryInfo(offset_int_vec(VectorSumCol), !_fused_assembly_path && _a_offset != 0
                                        && _reshape_b_only_on_first_run ?
                                        MemoryLifetime::Persistent :
                                        MemoryLifetime::Temporary,
                                        _vector_sum_col.total_size());
    _aux_mem[VectorSumRow] = MemoryInfo(offset_int_vec(VectorSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());
    _aux_mem[TmpA]         = MemoryInfo(offset_int_vec(TmpA), MemoryLifetime::Temporary, _tmp_a.total_size());
    _aux_mem[TmpB]         = MemoryInfo(offset_int_vec(TmpB), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
    _aux_mem[MMResultS32]  = MemoryInfo(offset_int_vec(MMResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());
    _aux_mem[SignedA]      = MemoryInfo(offset_int_vec(SignedA), MemoryLifetime::Temporary, _signed_a.total_size());
    _aux_mem[SignedOutput] = MemoryInfo(offset_int_vec(SignedOutput), MemoryLifetime::Temporary, _signed_output.total_size());
}

Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
{
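    // validate() mirrors the decisions taken in configure(): it repeats the same path selection
    // (assembly vs. reference, fused vs. separate output stage) and forwards the checks to each
    // kernel's static validate().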
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
                                    "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");

    GEMMInfo           info          = gemm_info;
    const ITensorInfo *matrix_a_info = a;
    const ITensorInfo *matrix_b_info = b;

    const ITensorInfo *a_to_use = a;

    TensorInfo tmp_a_info{};
    TensorInfo tmp_b_info{};
    TensorInfo mm_result_s32_info{};

    int32_t a_offset = a->quantization_info().uniform().offset;
    int32_t b_offset = b->quantization_info().uniform().offset;

    bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
    if(fuse_output_stage)
    {
        auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
    }

    // Convert QASYMM8->QASYMM8_SIGNED
    TensorInfo signed_a{};
    TensorInfo signed_output{};
    bool       flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();
    if(flip_signedness)
    {
        const int32_t                 offset_correction = 128;
        const DataType                dt                = DataType::QASYMM8_SIGNED;
        const UniformQuantizationInfo iqinfo            = a_to_use->quantization_info().uniform();

        signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
        ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a));
        a_to_use = &signed_a;
        a_offset = signed_a.quantization_info().uniform().offset;

        const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
        signed_output                        = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));

        // Output stage correction
        GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
        output_stage_corr.gemmlowp_offset         = signed_output.quantization_info().uniform().offset;
        output_stage_corr.gemmlowp_min_bound -= offset_correction;
        output_stage_corr.gemmlowp_max_bound -= offset_correction;
        info.set_gemmlowp_output_stage(output_stage_corr);

        // Update matrix a
        matrix_a_info = &signed_a;
    }

    // Initialize assembly kernel meta-data
    const AsmGemmInfo asm_info = init_assembly_metadata(info);

    // Check if we need to run the optimized assembly kernel
    bool run_optimised             = false;
    bool run_optimised_requantized = false;

    if(!(!b->are_values_constant() && b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently.
    {
        if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
        {
            run_optimised             = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, c, output, asm_info));
            run_optimised_requantized = run_optimised;
        }
        else
        {
            run_optimised = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info));
        }
    }

    if(run_optimised)
    {
        ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
        if(info.depth_output_gemm3d() != 0)
        {
            if(info.reinterpret_input_as_3d())
            {
                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
            }
            else
            {
                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
            }
        }
        else
        {
            ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
        }
    }
    else
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");

        const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
        if(!run_vector_matrix_multiplication)
        {
            matrix_a_info = &tmp_a_info;
            matrix_b_info = &tmp_b_info;

            // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
            TensorShape shape_tmp_a = a->tensor_shape();
            shape_tmp_a.set(0, a->dimension(0) * 4);
            shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));

            // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
            TensorShape shape_tmp_b = b->tensor_shape();
            shape_tmp_b.set(0, b->dimension(1) * 16);
            shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));

            // Validate interleave kernel
            auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
            auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));

            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info));
        }
    }

    if(!run_optimised_requantized)
    {
        TensorInfo info_vector_sum_col{};
        TensorInfo info_vector_sum_row{};

        const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);

        // Validate matrix B reduction kernel only if _a_offset is not equal to 0
        if(a_offset != 0)
        {
            info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);

            // Configure Matrix B reduction kernel
            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info));
        }

        // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
        if(b_offset != 0)
        {
            info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);

            // Configure matrix A reduction kernel
            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info));
        }

        if(fuse_output_stage)
        {
            if(!run_optimised)
            {
                ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
                ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");

                ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info));
            }

            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
                                                                                                          a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                                          b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                                          c,
                                                                                                          flip_signedness ? &signed_output : output,
                                                                                                          a_offset, b_offset,
                                                                                                          info.gemmlowp_output_stage()));
        }
        else
        {
            if(!run_optimised)
            {
                ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
                ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");

                ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
            }
            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate(output,
                                                                                               a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                               b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                               a_offset, b_offset));
        }
    }

    // Validate activation
    const ActivationLayerInfo &activation = gemm_info.activation_info();
    if(activation.enabled())
    {
        ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, activation));
    }

    return Status{};
}

void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
{
    prepare(tensors);

    auto a        = tensors.get_const_tensor(TensorType::ACL_SRC_0);
    auto b        = tensors.get_const_tensor(TensorType::ACL_SRC_1);
    auto c        = tensors.get_const_tensor(TensorType::ACL_SRC_2);
    auto dst      = tensors.get_tensor(TensorType::ACL_DST);
    auto a_to_use = a;
    auto matrix_a = a;
    auto matrix_b = b;

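    // Bind the auxiliary tensors requested in configure() (see _aux_mem) to the memory provided
    // through the tensor pack; each handler resolves its tensor by its offset_int_vec() identifier.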
    CpuAuxTensorHandler vector_sum_col(offset_int_vec(VectorSumCol), _vector_sum_col, tensors, false);
    CpuAuxTensorHandler vector_sum_row(offset_int_vec(VectorSumRow), _vector_sum_row, tensors, false);
    CpuAuxTensorHandler tmp_a(offset_int_vec(TmpA), _tmp_a, tensors, false);
    CpuAuxTensorHandler tmp_b(offset_int_vec(TmpB), _tmp_b, tensors, true);
    CpuAuxTensorHandler mm_result_s32(offset_int_vec(MMResultS32), _mm_result_s32, tensors, false);
    CpuAuxTensorHandler signed_a(offset_int_vec(SignedA), _signed_a, tensors, false);
    CpuAuxTensorHandler signed_output(offset_int_vec(SignedOutput), _signed_output, tensors, false);

    // Convert QASYMM8->QASYMM8_SIGNED
    if(_flip_signedness)
    {
        ITensorPack pack =
        {
            { TensorType::ACL_SRC, a },
            { TensorType::ACL_DST, signed_a.get() }
        };
        NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(), pack);
        a_to_use = signed_a.get();
        matrix_a = signed_a.get();
    }

    // Run GEMM
    if(_asm_glue->is_configured())
    {
        ITensorPack asm_glue_tensors = tensors;
        auto        output_to_use    = (_fuse_output_stage ? mm_result_s32.get() : dst);
        if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && _gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
        {
            asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use);
            asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b);
            asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_2, c);
            asm_glue_tensors.add_tensor(TensorType::ACL_DST, dst);
        }
        else
        {
            asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use);
            asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b);
            asm_glue_tensors.add_tensor(TensorType::ACL_DST, output_to_use);
        }
        _asm_glue->run(asm_glue_tensors);
    }
    else
    {
        if(!_run_vector_matrix_multiplication)
        {
            matrix_a = tmp_a.get();
            matrix_b = tmp_b.get();
            // Run interleave kernel
            ITensorPack pack_a =
            {
                { TensorType::ACL_SRC, a_to_use },
                { TensorType::ACL_DST, tmp_a.get() }
            };
            NEScheduler::get().schedule_op(_mtx_a_reshape_kernel.get(), Window::DimY, _mtx_a_reshape_kernel->window(), pack_a);

            if(!_reshape_b_only_on_first_run)
            {
                ITensorPack pack_b =
                {
                    { TensorType::ACL_SRC, b },
                    { TensorType::ACL_DST, tmp_b.get() }
                };
                // Run transpose kernel
                NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), pack_b);
            }
        }
        ITensorPack pack_mm =
        {
            { TensorType::ACL_SRC_0, matrix_a },
            { TensorType::ACL_SRC_1, matrix_b }
        };
        if(_fuse_output_stage)
        {
            pack_mm.add_tensor(TensorType::ACL_DST, mm_result_s32.get());
        }
        else
        {
            pack_mm.add_tensor(TensorType::ACL_DST, dst);
        }
        NEScheduler::get().schedule_op(_mm_kernel.get(), Window::DimY, _mm_kernel->window(), pack_mm);
    }

    if(!_fused_assembly_path)
    {
        // Run matrix A reduction kernel only if _b_offset is not equal to 0
        if(_b_offset != 0)
        {
            ITensorPack pack =
            {
                { TensorType::ACL_SRC, a_to_use },
                { TensorType::ACL_DST, vector_sum_row.get() }
            };
            NEScheduler::get().schedule_op(_mtx_a_reduction_kernel.get(), Window::DimX, _mtx_a_reduction_kernel->window(), pack);
        }

        // Run matrix B reduction kernel only if _a_offset is not equal to 0
        if(_a_offset != 0 && !_reshape_b_only_on_first_run)
        {
            ITensorPack pack =
            {
                { TensorType::ACL_SRC, b },
                { TensorType::ACL_DST, vector_sum_col.get() }
            };
            NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, _mtx_b_reduction_kernel->window(), pack);
        }

        if(_fuse_output_stage)
        {
            ITensorPack pack;
            pack.add_tensor(TensorType::ACL_SRC_0, mm_result_s32.get());
            pack.add_tensor(TensorType::ACL_SRC_1, _a_offset == 0 ? nullptr : vector_sum_col.get());
            pack.add_tensor(TensorType::ACL_SRC_2, _b_offset == 0 ? nullptr : vector_sum_row.get());
            pack.add_tensor(TensorType::ACL_SRC_3, c);
            pack.add_tensor(TensorType::ACL_DST, _flip_signedness ? signed_output.get() : dst);

            // Run offset contribution kernel
            NEScheduler::get().schedule_op(_offset_contribution_output_stage_kernel.get(), Window::DimY, _offset_contribution_output_stage_kernel->window(), pack);
        }
        else
        {
            ITensorPack pack;
            pack.add_tensor(TensorType::ACL_SRC_0, _a_offset == 0 ? nullptr : vector_sum_col.get());
            pack.add_tensor(TensorType::ACL_SRC_1, _b_offset == 0 ? nullptr : vector_sum_row.get());
            pack.add_tensor(TensorType::ACL_DST, dst);

            // Run offset contribution kernel
            NEScheduler::get().schedule_op(_offset_contribution_kernel.get(), Window::DimY, _offset_contribution_kernel->window(), pack);
        }
    }

    // Convert QASYMM8_SIGNED->QASYMM8
    if(!_fused_assembly_path && _fuse_output_stage && _flip_signedness)
    {
        ITensorPack pack =
        {
            { TensorType::ACL_SRC, signed_output.get() },
            { TensorType::ACL_DST, dst }
        };
        NEScheduler::get().schedule_op(_convert_from_signed_asymm.get(), Window::DimY, _convert_from_signed_asymm->window(), pack);
    }

    // Run fused activation unless already run in the fused assembly
    if(_run_activation)
    {
        ITensorPack pack =
        {
            { TensorType::ACL_SRC, dst },
            { TensorType::ACL_DST, dst }
        };
        _activation_func->run(pack);
    }
}

void CpuGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)
{
    if(!_is_prepared)
    {
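        // One-off work that depends only on the B operand: the assembly pretranspose, the
        // Transpose1xW reshape and the matrix-B column sums. The results are cached across run() calls.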
        auto original_b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
        // Run assembly reshape
        if(_asm_glue->is_configured())
        {
            _asm_glue->prepare(tensors);
        }
        // Run non-assembly reshape
        else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured())
        {
            // Run reshape kernel and mark original weights tensor as unused
            ITensor            *tmp_b_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(TmpB)));
            CpuAuxTensorHandler tmp_b(_tmp_b, *tmp_b_p);
            ITensorPack         pack =
            {
                { TensorType::ACL_SRC, original_b },
                { TensorType::ACL_DST, tmp_b.get() }
            };
            NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), pack);
        }

        // Run matrix B reduction kernel only if _a_offset is not equal to 0
        if(!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run)
        {
            ITensor            *vector_sum_col_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(VectorSumCol)));
            CpuAuxTensorHandler vector_sum_col(_vector_sum_col, *vector_sum_col_p);
            ITensorPack         pack =
            {
                { TensorType::ACL_SRC, original_b },
                { TensorType::ACL_DST, vector_sum_col.get() }
            };
            NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, _mtx_b_reduction_kernel->window(), pack);
        }
        _is_prepared = true;
    }
}
experimental::MemoryRequirements CpuGemmLowpMatrixMultiplyCore::workspace() const
{
    return _aux_mem;
}
} // namespace cpu
} // namespace arm_compute