blob: 89087129c37da0a0497d50aac29d14754578f4ac [file] [log] [blame]
/*
 * Copyright (c) 2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24
25#include "src/cpu/operators/CpuMatMul.h"
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010026
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +000027#include "arm_compute/core/experimental/Types.h"
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010028#include "arm_compute/core/Types.h"
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +000029#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +010030#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010031#include "arm_compute/core/Validate.h"
SiCong Li91295492023-07-21 18:16:13 +010032#include "arm_compute/function_info/MatMulInfo.h"
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +000033#include "arm_compute/runtime/NEON/functions/NEMatMul.h"
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010034#include "arm_compute/runtime/NEON/NEScheduler.h"
35
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +000036#include "src/common/utils/Log.h"
37#include "src/core/CPP/Validate.h"
38#include "src/core/helpers/AutoConfiguration.h"
39#include "src/core/helpers/MemoryHelpers.h"
Viet-Hoa Doa62129a2023-04-26 15:38:45 +010040#include "src/core/utils/quantization/AsymmHelpers.h"
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +000041#include "src/cpu/utils/CpuAuxTensorHandler.h"
42
43using namespace arm_compute::experimental;
44
45namespace arm_compute
46{
47namespace cpu
48{
Viet-Hoa Do9c7c2d22023-04-11 17:16:27 +010049namespace
50{
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010051Status get_gemmlowp_output_stage_info(const ITensorInfo *src,
52 const ITensorInfo *weights,
53 const ITensorInfo *dst,
54 const ActivationLayerInfo &act,
55 GEMMLowpOutputStageInfo &gemmlowp_output_stage_info)
Viet-Hoa Do9c7c2d22023-04-11 17:16:27 +010056{
57 const auto data_type = src->data_type();
58 const QuantizationInfo oq_info = dst->quantization_info();
59 const UniformQuantizationInfo iq_unif = src->quantization_info().uniform();
60 const UniformQuantizationInfo wq_unif = weights->quantization_info().uniform();
61 const UniformQuantizationInfo oq_unif = oq_info.uniform();
62
63 float multiplier = (iq_unif.scale * wq_unif.scale) / oq_unif.scale;
64 int32_t output_multiplier;
65 int32_t output_shift;
66
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010067 ARM_COMPUTE_RETURN_ON_ERROR(
68 quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
Viet-Hoa Do9c7c2d22023-04-11 17:16:27 +010069
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010070 int32_t type_min = 0;
71 int32_t type_max = 0;
Viet-Hoa Do9c7c2d22023-04-11 17:16:27 +010072 std::tie(type_min, type_max) = quantization::get_quantized_asymmetric_output_min_max(oq_info, act, data_type);
73
74 gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier;
75 gemmlowp_output_stage_info.gemmlowp_shift = output_shift;
76 gemmlowp_output_stage_info.gemmlowp_offset = oq_unif.offset;
77 gemmlowp_output_stage_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
Viet-Hoa Doa62129a2023-04-26 15:38:45 +010078 gemmlowp_output_stage_info.gemmlowp_min_bound = type_min;
79 gemmlowp_output_stage_info.gemmlowp_max_bound = type_max;
Viet-Hoa Do9c7c2d22023-04-11 17:16:27 +010080
81 return Status{};
82}
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +010083} // namespace
Viet-Hoa Do9c7c2d22023-04-11 17:16:27 +010084
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +000085CpuMatMul::CpuMatMul()
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010086 : _transpose_kernel_lhs(),
87 _transpose_kernel_rhs(),
88 _asm_glue(),
89 _lhs_transposed(),
90 _rhs_transposed(),
91 _original_lhs_shape(),
92 _original_rhs_shape(),
93 _original_dst_shape()
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +000094{
95}
96
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010097Status CpuMatMul::validate(const ITensorInfo *lhs,
98 const ITensorInfo *rhs,
99 const ITensorInfo *dst,
100 const MatMulInfo &info,
101 const CpuMatMulSettings &settings,
102 const ActivationLayerInfo &act_info)
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +0000103{
Viet-Hoa Do9c7c2d22023-04-11 17:16:27 +0100104 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs, dst);
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100105 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::QASYMM8,
106 DataType::QASYMM8_SIGNED);
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +0000107 ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->are_values_constant(), "LHS Tensor must be dynamic.");
108 ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs->are_values_constant(), "RHS Tensor must be dynamic.");
109 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(lhs);
110 ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(lhs);
111
112 const auto adj_lhs = info.adj_lhs();
113 const auto adj_rhs = info.adj_rhs();
114
115 const ITensorInfo *lhs_to_use = lhs;
116 const ITensorInfo *rhs_to_use = rhs;
117 TensorInfo lhs_transposed{};
118 TensorInfo rhs_transposed{};
119
120 auto gemm_info = AsmGemmInfo();
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +0100121 gemm_info.activation_info = act_info;
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +0000122 gemm_info.fast_mode = settings.fast_math();
123
124 // Validate and then permute a/b
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100125 if (adj_lhs)
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +0000126 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100127 auto_init_if_empty(lhs_transposed,
128 lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*lhs)));
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +0000129 ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuTransposeKernel::validate(lhs_to_use, &lhs_transposed));
130 // Assign lhs_to_use pointer to use transposed TensorInfo
131 lhs_to_use = &lhs_transposed;
132 }
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100133 if (adj_rhs)
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +0000134 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100135 auto_init_if_empty(rhs_transposed,
136 rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*rhs)));
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +0000137 ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuTransposeKernel::validate(rhs_to_use, &rhs_transposed));
138 // Assign rhs_to_use pointer to use transposed TensorInfo
139 rhs_to_use = &rhs_transposed;
140 }
141
142 ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(0) != rhs_to_use->dimension(1),
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100143 "The product AB is defined only if the number of columns in A is equal to the "
144 "number of rows in B (after transpose)");
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +0000145
Viet-Hoa Do54e52a92023-05-02 16:20:58 +0100146 // Iterate over dimensions to be collapsed in operator - check dimensions are equivalent between tensors
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100147 for (unsigned int i = 2; i < Coordinates::num_max_dimensions; i++)
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +0000148 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100149 ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(i) != rhs_to_use->dimension(i),
150 "Broadcasting in Batch dimension is unsupported by this operator.");
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +0000151 }
152
Viet-Hoa Do9c7c2d22023-04-11 17:16:27 +0100153 // Quantized-specific configuration
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100154 if (is_data_type_quantized(lhs->data_type()))
Viet-Hoa Do9c7c2d22023-04-11 17:16:27 +0100155 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100156 ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(lhs_to_use, rhs_to_use, dst,
157 gemm_info.activation_info, gemm_info.output_stage));
Viet-Hoa Do9c7c2d22023-04-11 17:16:27 +0100158 }
159
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +0000160 cpu::CpuGemmAssemblyDispatch::validate(lhs_to_use, rhs_to_use, nullptr, dst, gemm_info);
161
162 return Status{};
163}
164
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100165void CpuMatMul::configure(ITensorInfo *lhs,
166 ITensorInfo *rhs,
167 ITensorInfo *dst,
168 const MatMulInfo &info,
169 const CpuMatMulSettings &settings,
170 const ActivationLayerInfo &act_info)
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +0000171{
172 ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
173 ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, info, settings);
174 ARM_COMPUTE_ERROR_THROW_ON(CpuMatMul::validate(lhs, rhs, dst, info, settings));
175
176 _adj_lhs = info.adj_lhs();
177 _adj_rhs = info.adj_rhs();
178 _fast_math = settings.fast_math();
179
180 // 1. Create and reshape tensors
181 // ------------------------------------------------------
182 // a. Clone TensorInfo to prevent changing original tensor values during setup
183 // b. Change shape of lhs/dst to [x, y, 1, collapsed(z)] to match assembly kernel configuration
184 // c. For rhs collapse all dimensions larger than 3 to z dimension
185 TensorInfo lhs_to_use = *lhs->clone();
186 TensorInfo dst_to_use = *dst->clone();
187 TensorInfo rhs_to_use = *rhs->clone();
188
189 // Save starting shape of tensors
190 _original_lhs_shape = lhs_to_use.tensor_shape();
191 _original_dst_shape = dst_to_use.tensor_shape();
192 _original_rhs_shape = rhs_to_use.tensor_shape();
193
194 // Reshape lhs for use with assembly kernels.
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100195 lhs_to_use.set_tensor_shape(
196 TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, _original_lhs_shape.collapsed_from(2).z()));
197 dst_to_use.set_tensor_shape(
198 TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, _original_dst_shape.collapsed_from(2).z()));
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +0000199 rhs_to_use.set_tensor_shape(_original_rhs_shape.collapsed_from(2));
200
201 // 2. Configuration for transpose of lhs/rhs
202 // ------------------------------------------------------
203 // Initialise transposed TensorInfo class for aux tensors (intermediary tensors)
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100204 if (_adj_lhs)
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +0000205 {
206 // Setup transpose LHS
207 _transpose_kernel_lhs = std::make_unique<cpu::kernels::CpuTransposeKernel>();
208 _transpose_kernel_lhs->configure(&lhs_to_use, &_lhs_transposed);
209 }
210
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100211 if (_adj_rhs)
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +0000212 {
213 // Setup transpose RHS
214 _transpose_kernel_rhs = std::make_unique<cpu::kernels::CpuTransposeKernel>();
215 _transpose_kernel_rhs->configure(&rhs_to_use, &_rhs_transposed);
216 }
217
218 // 3. Configure assembly kernel using transposed tensors.
219 // -----------------------------------------------------
220 // Use transposed tensors if the corresponding transpose flags are set
221 // Fill AsmGemmInfo class object before configuration
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +0100222 _gemm_info.activation_info = act_info;
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +0000223 _gemm_info.fast_mode = settings.fast_math();
Jakub Sujake9b3ee22023-04-17 12:08:48 +0100224 _gemm_info.negated_offsets = false;
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +0000225
226 lhs_to_use = (_adj_lhs) ? _lhs_transposed : lhs_to_use;
227 rhs_to_use = (_adj_rhs) ? _rhs_transposed : rhs_to_use;
228
Viet-Hoa Do9c7c2d22023-04-11 17:16:27 +0100229 // Quantized-specific configuration
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100230 if (is_data_type_quantized(lhs->data_type()))
Viet-Hoa Do9c7c2d22023-04-11 17:16:27 +0100231 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100232 get_gemmlowp_output_stage_info(&lhs_to_use, &rhs_to_use, &dst_to_use, _gemm_info.activation_info,
233 _gemm_info.output_stage);
Viet-Hoa Do9c7c2d22023-04-11 17:16:27 +0100234 }
235
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +0000236 // Configure Asm Kernel
237 _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100238 _asm_glue->configure(&lhs_to_use, &rhs_to_use, nullptr, &dst_to_use,
239 _gemm_info); // c is nullptr as bias not supported in MatMul
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +0000240
241 // Specify memory requirements for intermediate tensors
242 auto asm_mem_req = _asm_glue->workspace();
243 // Specify memory required by gemm kernel
244 int idx = 0;
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100245 for (const auto &aux : asm_mem_req)
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +0000246 {
247 _aux_mem[idx] = aux;
248 idx++;
249 }
250 // Memory requirements for transposed tensors
251 _aux_mem[TransposeLHS] = MemoryInfo(offset_int_vec(TransposeLHS), MemoryLifetime::Temporary, lhs->total_size());
252 _aux_mem[TransposeRHS] = MemoryInfo(offset_int_vec(TransposeRHS), MemoryLifetime::Temporary, rhs->total_size());
253}
254
255void CpuMatMul::run(ITensorPack &tensors)
256{
257 // Retrieve tensors from tensor pack
258 auto lhs = tensors.get_tensor(ACL_SRC_0);
259 auto rhs = tensors.get_const_tensor(ACL_SRC_1);
260 auto dst = tensors.get_tensor(ACL_DST);
261
262 // Reshape LHS and DST to ensure compatibility with GEMM asm kernel (Batch dimensions is 4th for lhs and dst within asm)
263 // Collapse RHS (necessary to support dimensions larger than 3 in gemm assembly)
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100264 lhs->info()->set_tensor_shape(
265 TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1,
266 _original_lhs_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z
267 dst->info()->set_tensor_shape(
268 TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1,
269 _original_dst_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +0000270 rhs->info()->set_tensor_shape(_original_rhs_shape.collapsed_from(2));
271
272 // Initialise object to handle stored transposed tensors in auxillary memory
273 CpuAuxTensorHandler lhs_transposed(offset_int_vec(TransposeLHS), _lhs_transposed, tensors, true);
274 CpuAuxTensorHandler rhs_transposed(offset_int_vec(TransposeRHS), _rhs_transposed, tensors, true);
275
276 // Create tensor pack for asm kernel
277 ITensorPack asm_tensors(tensors);
278
279 // Run transpose lhs if necessary
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100280 if (_adj_lhs)
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +0000281 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100282 ITensorPack lhs_transpose_pack = {{TensorType::ACL_SRC, lhs}, {TensorType::ACL_DST, lhs_transposed.get()}};
283 NEScheduler::get().schedule_op(_transpose_kernel_lhs.get(), Window::DimY, _transpose_kernel_lhs->window(),
284 lhs_transpose_pack);
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +0000285 asm_tensors.add_const_tensor(TensorType::ACL_SRC_0, lhs_transposed.get());
286 }
287 // Run transpose rhs if necessary
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100288 if (_adj_rhs)
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +0000289 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100290 ITensorPack rhs_transpose_pack = {{TensorType::ACL_SRC, rhs}, {TensorType::ACL_DST, rhs_transposed.get()}};
291 NEScheduler::get().schedule_op(_transpose_kernel_rhs.get(), Window::DimY, _transpose_kernel_rhs->window(),
292 rhs_transpose_pack);
Mohammed Suhail Munshia1b1e412023-03-23 22:21:31 +0000293 asm_tensors.add_const_tensor(TensorType::ACL_SRC_1, rhs_transposed.get());
294 }
295 // Run asm kernel
296 _asm_glue->run(asm_tensors);
297
298 // Undo reshape of tensors
299 dst->info()->set_tensor_shape(_original_dst_shape);
300 lhs->info()->set_tensor_shape(_original_lhs_shape);
301 rhs->info()->set_tensor_shape(_original_rhs_shape);
302}
303
304experimental::MemoryRequirements CpuMatMul::workspace() const
305{
306 return _aux_mem;
307}
308} // namespace cpu
309} // namespace arm_compute