blob: 66331bc81879ec29c365d3d724fad5888774fee8 [file] [log] [blame]
/*
 * Copyright (c) 2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24#include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h"
25
26#include "arm_compute/core/CL/CLHelpers.h"
27#include "arm_compute/core/CL/ICLTensor.h"
28#include "arm_compute/core/ITensorPack.h"
Gunes Bayire87fa662023-09-07 12:20:33 +010029#include "arm_compute/core/QuantizationInfo.h"
Gunes Bayir9d0c4de2023-04-13 18:22:58 +010030#include "arm_compute/core/TensorInfo.h"
Gunes Bayire87fa662023-09-07 12:20:33 +010031#include "arm_compute/core/utils/ActivationFunctionUtils.h"
32#include "arm_compute/core/utils/StringUtils.h"
Matthew Bentham314d3e22023-06-23 10:53:52 +000033#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
Gunes Bayir9d0c4de2023-04-13 18:22:58 +010034#include "arm_compute/core/utils/misc/ShapeCalculator.h"
35#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
36
37#include "src/common/utils/Log.h"
38#include "src/core/helpers/AutoConfiguration.h"
39#include "src/core/helpers/WindowHelpers.h"
40#include "src/gpu/cl/ClCompileContext.h"
Gunes Bayire87fa662023-09-07 12:20:33 +010041#include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h"
Mohammed Suhail Munshic9eeee52023-06-30 15:43:29 +010042
Gunes Bayir9d0c4de2023-04-13 18:22:58 +010043#include "support/Cast.h"
44#include "support/StringSupport.h"
45
46namespace arm_compute
47{
48namespace opencl
49{
50namespace kernels
51{
52namespace
53{
54Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info)
55{
56 const bool adj_lhs = matmul_kernel_info.adj_lhs;
57 const bool adj_rhs = matmul_kernel_info.adj_rhs;
58 const int m0 = matmul_kernel_info.m0;
59 const int n0 = matmul_kernel_info.n0;
60 const int k0 = matmul_kernel_info.k0;
61
62 // Validate M0
63 ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0");
64
65 if(adj_lhs)
66 {
67 ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16), "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed");
68 }
69
70 // Validate N0
71 ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0");
72 ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16), "Only 1,2,3,4,8,16 are supported for N0");
73
74 // Validate K0
75 ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 < 1, "Only positive integers are supported for K0");
76 if(!adj_lhs || adj_rhs)
77 {
78 ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16), "Only 1,2,3,4,8,16 are supported for K0");
79 }
80
81 return Status{};
82}
Gunes Bayir9d0c4de2023-04-13 18:22:58 +010083}
84ClMatMulLowpNativeKernel::ClMatMulLowpNativeKernel()
85{
86 _type = CLKernelType::GEMM;
87}
Mohammed Suhail Munshi8e2dede2023-06-27 14:25:58 +010088Status ClMatMulLowpNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
89 const ActivationLayerInfo &act_info)
Gunes Bayir9d0c4de2023-04-13 18:22:58 +010090{
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +010091 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
Gunes Bayir9d0c4de2023-04-13 18:22:58 +010092 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
93 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
94 ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info));
Gunes Bayire87fa662023-09-07 12:20:33 +010095 ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
Gunes Bayir9d0c4de2023-04-13 18:22:58 +010096
Mohammed Suhail Munshic9eeee52023-06-30 15:43:29 +010097 ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.activation() != ActivationFunction::IDENTITY && act_info.activation() != ActivationFunction::RELU
98 && act_info.activation() != ActivationFunction::LU_BOUNDED_RELU && act_info.activation() != ActivationFunction::BOUNDED_RELU),
99 "Activation Function specified is unsupported.");
Mohammed Suhail Munshi8e2dede2023-06-27 14:25:58 +0100100 const TensorShape expected_output_shape = misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info);
Mohammed Suhail Munshic9eeee52023-06-30 15:43:29 +0100101
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +0100102 if(dst->total_size() != 0)
Gunes Bayir9d0c4de2023-04-13 18:22:58 +0100103 {
Mohammed Suhail Munshi8e2dede2023-06-27 14:25:58 +0100104 const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(expected_output_shape);
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +0100105 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output);
106 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
Gunes Bayir9d0c4de2023-04-13 18:22:58 +0100107 }
108
Mohammed Suhail Munshi8e2dede2023-06-27 14:25:58 +0100109 if(bias != nullptr)
110 {
111 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
112 ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
113 ARM_COMPUTE_RETURN_ERROR_ON(expected_output_shape[0] != bias->dimension(0));
114 }
115
Gunes Bayir9d0c4de2023-04-13 18:22:58 +0100116 return Status{};
117}
Mohammed Suhail Munshi8e2dede2023-06-27 14:25:58 +0100118void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +0100119 const ActivationLayerInfo &act_info)
Gunes Bayir9d0c4de2023-04-13 18:22:58 +0100120{
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +0100121 ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst, &compile_context, &matmul_kernel_info);
Mohammed Suhail Munshi8e2dede2023-06-27 14:25:58 +0100122 ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst, matmul_kernel_info);
123 ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info));
Gunes Bayir9d0c4de2023-04-13 18:22:58 +0100124
Mohammed Suhail Munshi8e2dede2023-06-27 14:25:58 +0100125 // dst tensor auto initialization if not yet initialized
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +0100126 auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
Gunes Bayir9d0c4de2023-04-13 18:22:58 +0100127
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +0100128 const int m = dst->dimension(1);
129 const int n = dst->dimension(0);
Gunes Bayir9d0c4de2023-04-13 18:22:58 +0100130 const int k = matmul_kernel_info.adj_lhs ? lhs->tensor_shape().y() : lhs->tensor_shape().x();
131 const bool adj_lhs = matmul_kernel_info.adj_lhs;
132
133 int m0 = adj_lhs ? adjust_vec_size(matmul_kernel_info.m0, m) : std::min(matmul_kernel_info.m0, m);
134 int n0 = adjust_vec_size(matmul_kernel_info.n0, n);
135
136 // Configure kernel window
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +0100137 Window win = calculate_max_window(*dst, Steps(n0, m0));
Gunes Bayir9d0c4de2023-04-13 18:22:58 +0100138 win = win.collapse(win, Window::DimZ);
139 IClKernel::configure_internal(win);
140
141 // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
142 const unsigned int partial_store_m0 = m % m0;
143 const unsigned int partial_store_n0 = n % n0;
144
145 CLBuildOptions build_opts;
146 build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(lhs->data_type()));
147 build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
148 build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
149 build_opts.add_option("-DK0=" + support::cpp11::to_string(matmul_kernel_info.k0));
150 build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
151 build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
152 build_opts.add_option("-DK=" + support::cpp11::to_string(k));
153
154 const UniformQuantizationInfo lqinfo = lhs->quantization_info().uniform();
155 const UniformQuantizationInfo rqinfo = rhs->quantization_info().uniform();
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +0100156 const UniformQuantizationInfo dqinfo = dst->quantization_info().uniform();
Gunes Bayir9d0c4de2023-04-13 18:22:58 +0100157
158 float multiplier = lqinfo.scale * rqinfo.scale / dqinfo.scale;
159 int output_multiplier = 0;
160 int output_shift = 0;
161 arm_compute::quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
162
163 build_opts.add_option("-DDST_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
164 build_opts.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift));
165
Mohammed Suhail Munshia2bb80e2023-06-19 14:57:57 +0100166 // Note : Offset is not negated, unlike gemmlowp kernels
167 build_opts.add_option("-DLHS_OFFSET=" + support::cpp11::to_string(lqinfo.offset));
168 build_opts.add_option("-DRHS_OFFSET=" + support::cpp11::to_string(rqinfo.offset));
Mohammed Suhail Munshi8e2dede2023-06-27 14:25:58 +0100169 build_opts.add_option("-DDST_OFFSET=" + support::cpp11::to_string(dqinfo.offset));
170 build_opts.add_option_if(bias != nullptr, "-DBIAS");
Gunes Bayir9d0c4de2023-04-13 18:22:58 +0100171
Mohammed Suhail Munshic9eeee52023-06-30 15:43:29 +0100172 // Floating point boundaries are quantized prior to being passed as arguments.
173 // Note: We expect the input and output tensors to always adopt a per-tensor quantization approach
174 int a_val{};
175 int b_val{};
176 std::tie(b_val, a_val) = get_quantized_activation_min_max(act_info, dst->data_type(), dqinfo);
177
178 build_opts.add_option("-DA_VAL=" + support::cpp11::to_string(a_val));
179 build_opts.add_option("-DB_VAL=" + support::cpp11::to_string(b_val));
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +0100180 build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
Mohammed Suhail Munshic9eeee52023-06-30 15:43:29 +0100181 build_opts.add_option("-DZERO_POINT=" + support::cpp11::to_string(dqinfo.offset));
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +0100182
Gunes Bayir9d0c4de2023-04-13 18:22:58 +0100183 std::string kernel_name("mat_mul_native_quantized");
184 kernel_name += matmul_kernel_info.adj_lhs ? "_t" : "_nt";
185 kernel_name += matmul_kernel_info.adj_rhs ? "_t" : "_nt";
186
187 // A macro guard to compile ONLY the kernel of interest
188 build_opts.add_option("-D" + upper_string(kernel_name));
189
190 // Create kernel
191 _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
192
193 // Set config_id for enabling LWS tuning
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +0100194 const size_t number_of_batches = dst->tensor_shape().total_size() / (m * n);
Gunes Bayir9d0c4de2023-04-13 18:22:58 +0100195
196 _config_id = kernel_name;
197 _config_id += "_";
198 _config_id += lower_string(string_from_data_type(lhs->data_type()));
199 _config_id += "_";
200 _config_id += support::cpp11::to_string(m);
201 _config_id += "_";
202 _config_id += support::cpp11::to_string(n);
203 _config_id += "_";
204 _config_id += support::cpp11::to_string(k);
205 _config_id += "_";
206 _config_id += support::cpp11::to_string(number_of_batches);
207 _config_id += "_";
208 _config_id += support::cpp11::to_string(m0);
209 _config_id += "_";
210 _config_id += support::cpp11::to_string(n0);
211 _config_id += "_";
212 _config_id += support::cpp11::to_string(matmul_kernel_info.k0);
213}
214
215void ClMatMulLowpNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
216{
217 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
218 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
219
Mohammed Suhail Munshi8e2dede2023-06-27 14:25:58 +0100220 const ICLTensor *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
221 const ICLTensor *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
222 const ICLTensor *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
223 ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +0100224 ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
Mohammed Suhail Munshi8e2dede2023-06-27 14:25:58 +0100225 ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst);
Gunes Bayir9d0c4de2023-04-13 18:22:58 +0100226
227 unsigned int idx = 0;
228 Window window_collapsed = window.collapse(ICLKernel::window(), Window::DimZ);
229
230 add_3d_tensor_nhw_argument(idx, lhs);
231 add_3d_tensor_nhw_argument(idx, rhs);
Mohammed Suhail Munshi8e2dede2023-06-27 14:25:58 +0100232 if(bias != nullptr)
233 {
234 add_3d_tensor_nhw_argument(idx, bias);
235 }
Mohammed Suhail Munshi94abde42023-05-25 16:48:43 +0100236 add_3d_tensor_nhw_argument(idx, dst);
Gunes Bayir9d0c4de2023-04-13 18:22:58 +0100237
238 enqueue(queue, *this, window_collapsed, lws_hint());
239}
240
241} // namespace kernels
242} // namespace opencl
243} // namespace arm_compute