/*
 * Copyright (c) 2023-2024 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "src/cpu/operators/CpuMatMul.h"

#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/function_info/MatMulInfo.h"
#include "arm_compute/runtime/NEON/functions/NEMatMul.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"

#include "src/common/utils/Log.h"
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/core/utils/quantization/AsymmHelpers.h"
#include "src/cpu/utils/CpuAuxTensorHandler.h"

using namespace arm_compute::experimental;

namespace arm_compute
{
namespace cpu
{
namespace
{
Status get_gemmlowp_output_stage_info(const ITensorInfo         *src,
                                      const ITensorInfo         *weights,
                                      const ITensorInfo         *dst,
                                      const ActivationLayerInfo &act,
                                      GEMMLowpOutputStageInfo   &gemmlowp_output_stage_info)
{
    const auto                    data_type = src->data_type();
    const QuantizationInfo        oq_info   = dst->quantization_info();
    const UniformQuantizationInfo iq_unif   = src->quantization_info().uniform();
    const UniformQuantizationInfo wq_unif   = weights->quantization_info().uniform();
    const UniformQuantizationInfo oq_unif   = oq_info.uniform();

    float   multiplier = (iq_unif.scale * wq_unif.scale) / oq_unif.scale;
    int32_t output_multiplier;
    int32_t output_shift;

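    // calculate_quantized_multiplier() decomposes the float rescale factor into a
    // fixed-point multiplier and a shift such that, approximately,
    //   multiplier ~= output_multiplier * 2^-31 * 2^-output_shift
    // which lets the requantization below run with integer-only arithmetic.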
    ARM_COMPUTE_RETURN_ON_ERROR(
        quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));

    int32_t type_min = 0;
    int32_t type_max = 0;
    std::tie(type_min, type_max) = quantization::get_quantized_asymmetric_output_min_max(oq_info, act, data_type);

    gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier;
    gemmlowp_output_stage_info.gemmlowp_shift      = output_shift;
    gemmlowp_output_stage_info.gemmlowp_offset     = oq_unif.offset;
    gemmlowp_output_stage_info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
    gemmlowp_output_stage_info.gemmlowp_min_bound  = type_min;
    gemmlowp_output_stage_info.gemmlowp_max_bound  = type_max;

    return Status{};
}
} // namespace

CpuMatMul::CpuMatMul()
    : _transpose_kernel_lhs(),
      _transpose_kernel_rhs(),
      _asm_glue(),
      _lhs_transposed(),
      _rhs_transposed(),
      _original_lhs_shape(),
      _original_rhs_shape(),
      _original_dst_shape()
{
}

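// A minimal usage sketch (illustrative only; tensor names are hypothetical): the caller
// configures the operator once, provides the auxiliary buffers reported by workspace(),
// then runs it with a tensor pack.
//
//   CpuMatMul op;
//   op.configure(a.info(), b.info(), out.info(), MatMulInfo(), CpuMatMulSettings());
//   ITensorPack pack{{TensorType::ACL_SRC_0, &a}, {TensorType::ACL_SRC_1, &b}, {TensorType::ACL_DST, &out}};
//   op.run(pack);
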
Status CpuMatMul::validate(const ITensorInfo         *lhs,
                           const ITensorInfo         *rhs,
                           const ITensorInfo         *dst,
                           const MatMulInfo          &info,
                           const CpuMatMulSettings   &settings,
                           const ActivationLayerInfo &act_info)
{
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs, dst);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::BFLOAT16,
                                                         DataType::QASYMM8, DataType::QASYMM8_SIGNED);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->are_values_constant(), "LHS Tensor must be dynamic.");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs->are_values_constant(), "RHS Tensor must be dynamic.");
    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(lhs);
    ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(lhs);

    const auto adj_lhs = info.adj_lhs();
    const auto adj_rhs = info.adj_rhs();

    const ITensorInfo *lhs_to_use = lhs;
    const ITensorInfo *rhs_to_use = rhs;
    TensorInfo         lhs_transposed{};
    TensorInfo         rhs_transposed{};

    auto gemm_info            = AsmGemmInfo();
    gemm_info.activation_info = act_info;
    gemm_info.fast_mode       = settings.fast_math();
    gemm_info.fixed_format    = settings.fixed_format();

    // Validate and, if requested, transpose lhs/rhs
    if (adj_lhs)
    {
        auto_init_if_empty(lhs_transposed,
                           lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*lhs)));
        ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuTransposeKernel::validate(lhs_to_use, &lhs_transposed));
        // Assign lhs_to_use pointer to use transposed TensorInfo
        lhs_to_use = &lhs_transposed;
    }
    if (adj_rhs)
    {
        auto_init_if_empty(rhs_transposed,
                           rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*rhs)));
        ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuTransposeKernel::validate(rhs_to_use, &rhs_transposed));
        // Assign rhs_to_use pointer to use transposed TensorInfo
        rhs_to_use = &rhs_transposed;
    }

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(0) != rhs_to_use->dimension(1),
                                    "The product AB is defined only if the number of columns in A is equal to the "
                                    "number of rows in B (after transpose)");

    // Iterate over dimensions to be collapsed in operator - check dimensions are equivalent between tensors
    for (unsigned int i = 2; i < Coordinates::num_max_dimensions; i++)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(i) != rhs_to_use->dimension(i),
                                        "Broadcasting in Batch dimension is unsupported by this operator.");
    }
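    // For example (ACL shape order [x, y, batches...]): lhs [K, M, B] with rhs [N, K, B]
    // yields dst [N, M, B]; all batch dimensions must match exactly between lhs and rhs.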

    // Quantized-specific configuration
    if (is_data_type_quantized(lhs->data_type()))
    {
        ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(lhs_to_use, rhs_to_use, dst,
                                                                   gemm_info.activation_info, gemm_info.output_stage));
    }

    if (gemm_info.fixed_format)
    {
        gemm_info.weight_format                          = WeightFormat::ANY;
        arm_compute::WeightFormat expected_weight_format = WeightFormat::ANY;
        ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, lhs_to_use,
                                                                               rhs_to_use, nullptr, dst, gemm_info));
    }

    ARM_COMPUTE_RETURN_ON_ERROR(
        cpu::CpuGemmAssemblyDispatch::validate(lhs_to_use, rhs_to_use, nullptr, dst, gemm_info));

    return Status{};
}

void CpuMatMul::configure(ITensorInfo               *lhs,
                          ITensorInfo               *rhs,
                          ITensorInfo               *dst,
                          const MatMulInfo          &info,
                          const CpuMatMulSettings   &settings,
                          const ActivationLayerInfo &act_info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
    ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, info, settings);
    ARM_COMPUTE_ERROR_THROW_ON(CpuMatMul::validate(lhs, rhs, dst, info, settings, act_info));

    _adj_lhs   = info.adj_lhs();
    _adj_rhs   = info.adj_rhs();
    _fast_math = settings.fast_math();

    // 1. Create and reshape tensors
    // ------------------------------------------------------
    // a. Clone TensorInfo to prevent changing original tensor values during setup
    // b. Change shape of lhs/dst to [x, y, 1, collapsed(z)] to match assembly kernel configuration
    // c. For rhs collapse all dimensions larger than 3 to z dimension
    TensorInfo lhs_to_use = *lhs->clone();
    TensorInfo dst_to_use = *dst->clone();
    TensorInfo rhs_to_use = *rhs->clone();

    // Save starting shape of tensors
    _original_lhs_shape = lhs_to_use.tensor_shape();
    _original_dst_shape = dst_to_use.tensor_shape();
    _original_rhs_shape = rhs_to_use.tensor_shape();

    // Reshape lhs and dst for use with assembly kernels.
    lhs_to_use.set_tensor_shape(
        TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, _original_lhs_shape.collapsed_from(2).z()));
    dst_to_use.set_tensor_shape(
        TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, _original_dst_shape.collapsed_from(2).z()));
    rhs_to_use.set_tensor_shape(_original_rhs_shape.collapsed_from(2));

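    // For example, with 4D inputs [x, y, z, w]: lhs/dst become [x, y, 1, z * w] and
    // rhs becomes [x, y, z * w], i.e. all batch dimensions are folded into one.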
    // 2. Configuration for transpose of lhs/rhs
    // ------------------------------------------------------
    // Initialise transposed TensorInfo class for aux tensors (intermediate tensors)
    if (_adj_lhs)
    {
        // Setup transpose LHS
        _transpose_kernel_lhs = std::make_unique<cpu::kernels::CpuTransposeKernel>();
        _transpose_kernel_lhs->configure(&lhs_to_use, &_lhs_transposed);
    }

    if (_adj_rhs)
    {
        // Setup transpose RHS
        _transpose_kernel_rhs = std::make_unique<cpu::kernels::CpuTransposeKernel>();
        _transpose_kernel_rhs->configure(&rhs_to_use, &_rhs_transposed);
    }

    // 3. Configure assembly kernel using transposed tensors.
    // -----------------------------------------------------
    // Use transposed tensors if the corresponding transpose flags are set
    // Fill AsmGemmInfo class object before configuration
    _gemm_info.activation_info = act_info;
    _gemm_info.fast_mode       = settings.fast_math();
    _gemm_info.fixed_format    = settings.fixed_format();
    _gemm_info.negated_offsets = false;

    lhs_to_use = (_adj_lhs) ? _lhs_transposed : lhs_to_use;
    rhs_to_use = (_adj_rhs) ? _rhs_transposed : rhs_to_use;

    // Quantized-specific configuration
    if (is_data_type_quantized(lhs->data_type()))
    {
        ARM_COMPUTE_ERROR_THROW_ON(get_gemmlowp_output_stage_info(&lhs_to_use, &rhs_to_use, &dst_to_use,
                                                                  _gemm_info.activation_info, _gemm_info.output_stage));
    }

    if (_gemm_info.fixed_format)
    {
        _gemm_info.weight_format                         = WeightFormat::ANY;
        arm_compute::WeightFormat expected_weight_format = WeightFormat::ANY;
        ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, &lhs_to_use,
                                                                              &rhs_to_use, nullptr, dst, _gemm_info));
        // Set gemm weights info to the one returned by has_opt_impl
        _gemm_info.weight_format = expected_weight_format;
        // has_opt_impl may return a non-fast-math kernel, even if we requested one
        _gemm_info.fast_mode = arm_compute::is_fixed_format_fast_math(expected_weight_format);
    }

    // Configure Asm Kernel
    _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
    _asm_glue->configure(&lhs_to_use, &rhs_to_use, nullptr, &dst_to_use,
                         _gemm_info); // c is nullptr as bias not supported in MatMul

    // Specify memory requirements for intermediate tensors
    auto asm_mem_req = _asm_glue->workspace();
    // Specify memory required by gemm kernel
    int idx = 0;
    for (const auto &aux : asm_mem_req)
    {
        _aux_mem[idx] = aux;
        idx++;
    }
    // Memory requirements for transposed tensors
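    // (MemoryLifetime::Temporary: these buffers are only live while run() executes,
    // so the memory manager may reuse them afterwards.)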
    _aux_mem[TransposeLHS] = MemoryInfo(offset_int_vec(TransposeLHS), MemoryLifetime::Temporary, lhs->total_size());
    _aux_mem[TransposeRHS] = MemoryInfo(offset_int_vec(TransposeRHS), MemoryLifetime::Temporary, rhs->total_size());
}

void CpuMatMul::run(ITensorPack &tensors)
{
    // Retrieve tensors from tensor pack
    auto lhs = tensors.get_tensor(ACL_SRC_0);
    auto rhs = tensors.get_const_tensor(ACL_SRC_1);
    auto dst = tensors.get_tensor(ACL_DST);

    // Reshape LHS and DST to ensure compatibility with GEMM asm kernel (Batch dimension is 4th for lhs and dst within asm)
    // Collapse RHS (necessary to support dimensions larger than 3 in gemm assembly)
    lhs->info()->set_tensor_shape(
        TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1,
                    _original_lhs_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z
    dst->info()->set_tensor_shape(
        TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1,
                    _original_dst_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z
    rhs->info()->set_tensor_shape(_original_rhs_shape.collapsed_from(2));

    // Initialise objects to handle stored transposed tensors in auxiliary memory
    CpuAuxTensorHandler lhs_transposed(offset_int_vec(TransposeLHS), _lhs_transposed, tensors, true);
    CpuAuxTensorHandler rhs_transposed(offset_int_vec(TransposeRHS), _rhs_transposed, tensors, true);
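    // The handlers look up the workspace buffers provided in `tensors` by their offset
    // ids and import that memory into the transposed TensorInfos for the scope of run().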

    // Create tensor pack for asm kernel
    ITensorPack asm_tensors(tensors);

    // Run transpose lhs if necessary
    if (_adj_lhs)
    {
        ITensorPack lhs_transpose_pack = {{TensorType::ACL_SRC, lhs}, {TensorType::ACL_DST, lhs_transposed.get()}};
        NEScheduler::get().schedule_op(_transpose_kernel_lhs.get(), Window::DimY, _transpose_kernel_lhs->window(),
                                       lhs_transpose_pack);
        asm_tensors.add_const_tensor(TensorType::ACL_SRC_0, lhs_transposed.get());
    }
    // Run transpose rhs if necessary
    if (_adj_rhs)
    {
        ITensorPack rhs_transpose_pack = {{TensorType::ACL_SRC, rhs}, {TensorType::ACL_DST, rhs_transposed.get()}};
        NEScheduler::get().schedule_op(_transpose_kernel_rhs.get(), Window::DimY, _transpose_kernel_rhs->window(),
                                       rhs_transpose_pack);
        asm_tensors.add_const_tensor(TensorType::ACL_SRC_1, rhs_transposed.get());
    }
    // Run asm kernel
    _asm_glue->run(asm_tensors);

    // Undo reshape of tensors
    dst->info()->set_tensor_shape(_original_dst_shape);
    lhs->info()->set_tensor_shape(_original_lhs_shape);
    rhs->info()->set_tensor_shape(_original_rhs_shape);
}

experimental::MemoryRequirements CpuMatMul::workspace() const
{
    return _aux_mem;
}
} // namespace cpu
} // namespace arm_compute