/*
 * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
25
26#include "arm_compute/core/CL/ICLTensor.h"
27#include "arm_compute/core/Error.h"
28#include "arm_compute/core/Helpers.h"
Michele Di Giorgiof64d3362020-04-03 12:40:10 +010029#include "arm_compute/core/KernelDescriptors.h"
Gian Marco05288a22017-11-21 10:57:50 +000030#include "arm_compute/core/TensorInfo.h"
31#include "arm_compute/core/Types.h"
32#include "arm_compute/core/Validate.h"
Georgios Pinitas358ca202017-12-07 16:47:52 +000033#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Vidhya Sudhan Loganathan951b8a42019-11-04 14:42:08 +000034#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
Gian Marco05288a22017-11-21 10:57:50 +000035#include "arm_compute/runtime/CL/CLScheduler.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010036#include "src/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h"
37#include "src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
Sang-Hoon Parkbef7fa22020-10-21 15:58:54 +010038#include "src/core/CL/kernels/CLDepthConvertLayerKernel.h"
39#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
40#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
41#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
42#include "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
43#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h"
44#include "src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010045#include "src/core/helpers/AutoConfiguration.h"
46#include "src/runtime/CL/gemm/CLGEMMKernelSelection.h"
Sang-Hoon Parkbef7fa22020-10-21 15:58:54 +010047#include "support/MemorySupport.h"
Gian Marco05288a22017-11-21 10:57:50 +000048
namespace arm_compute
{
using namespace arm_compute::misc::shape_calculator;
using namespace arm_compute::cl_gemm;

namespace
{
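// Decide between the native GEMM kernel and the "reshaped only RHS" one. The choice is delegated
// to a target-specific heuristic (CLGEMMKernelSelection), which picks a kernel type from the GEMM
// shape (m, n, k), the data type and whether the RHS matrix is constant across runs; broadly
// speaking (a heuristic summary, not a guarantee), a constant RHS makes the one-off reshape of
// RESHAPED_ONLY_RHS worthwhile because its cost is paid once in prepare().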
inline bool is_gemm_reshaped(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run)
{
    std::unique_ptr<ICLGEMMKernelSelection> gemm_kernel = CLGEMMKernelSelectionFactory::create(CLScheduler::get().target());
    ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_kernel.get());

    CLGEMMKernelSelectionParams params;
    params.m               = m;
    params.n               = n;
    params.k               = k;
    params.is_rhs_constant = reshape_b_only_on_first_run;
    params.data_type       = data_type;

    switch(gemm_kernel->select_kernel(params))
    {
        case CLGEMMKernelType::NATIVE:
            return false;
        case CLGEMMKernelType::RESHAPED_ONLY_RHS:
            return true;
        default:
            ARM_COMPUTE_ERROR("Unsupported GEMMLowp kernel!");
    }
}
} // namespace

CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(std::move(memory_manager)),
      _weights_to_qasymm8(support::cpp14::make_unique<CLDepthConvertLayerKernel>()),
      _mm_native_kernel(support::cpp14::make_unique<CLGEMMLowpMatrixMultiplyNativeKernel>()),
      _mm_reshaped_only_rhs_kernel(support::cpp14::make_unique<CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel>()),
      _mtx_b_reshape_kernel(support::cpp14::make_unique<CLGEMMReshapeRHSMatrixKernel>()),
      _mtx_a_reduction_kernel(support::cpp14::make_unique<CLGEMMLowpMatrixAReductionKernel>()),
      _mtx_b_reduction_kernel(support::cpp14::make_unique<CLGEMMLowpMatrixBReductionKernel>()),
      _offset_contribution_kernel(support::cpp14::make_unique<CLGEMMLowpOffsetContributionKernel>()),
      _offset_contribution_output_stage_kernel(support::cpp14::make_unique<CLGEMMLowpOffsetContributionOutputStageKernel>()),
      _qasymm8_weights(),
      _vector_sum_col(),
      _vector_sum_row(),
      _tmp_b(),
      _mm_result_s32(),
      _gemm_output_stage_multipliers(),
      _gemm_output_stage_shifts(),
      _matrix_a(nullptr),
      _original_b(nullptr),
      _output(nullptr),
      _a_offset(0),
      _b_offset(0),
      _is_gemm_reshaped(true),
      _reshape_b_only_on_first_run(false),
      _is_prepared(false),
      _run_output_stage(false),
      _convert_to_qasymm8(false),
      _run_offset_contribution(false)
{
}

CLGEMMLowpMatrixMultiplyCore::~CLGEMMLowpMatrixMultiplyCore() = default;

void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info)
{
    configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, gemm_info);
}

void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
    ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));

    _is_prepared                 = false;
    _original_b                  = b;
    _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
    _a_offset                    = a->info()->quantization_info().uniform().offset;
    _matrix_a                    = a;
    _output                      = output;

    _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->info()->data_type()) && is_data_type_quantized_symmetric(b->info()->data_type())
                          && a->info()->data_type() == DataType::QASYMM8;
    _b_offset = _convert_to_qasymm8 ? -128 : b->info()->quantization_info().uniform().offset;

    // Get the GPU target
    const GPUTarget gpu_target = CLScheduler::get().target();

    // Set the target for the kernels
    _mm_native_kernel->set_target(gpu_target);
    _mm_reshaped_only_rhs_kernel->set_target(gpu_target);

    GEMMRHSMatrixInfo rhs_info;
    GEMMLHSMatrixInfo lhs_info;

    // Arguments used by GEMMReshapeInfo
    // If matrix A and matrix B have been reshaped before the matrix multiply kernel, GEMMReshapeInfo needs m, n, k, mult_transpose1xW_width and
    // mult_interleave4x4_height so the kernel knows how the matrices were reshaped
    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
    const unsigned int m                       = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
    const unsigned int n                       = b->info()->dimension(0);
    const unsigned int k                       = a->info()->dimension(0);
    const unsigned int batch_size              = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();

    // Check if we need to reshape the matrix A and matrix B
    _is_gemm_reshaped = is_gemm_reshaped(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run);

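    // When the weights are symmetric signed (QSYMM8/QSYMM8_PER_CHANNEL) and the input is QASYMM8,
    // the weights are converted to QASYMM8 below so the asymmetric OpenCL kernels can be reused;
    // the change of representation is then accounted for through the fixed -128 _b_offset set
    // above. This note summarizes the surrounding code, not the convert kernel's internals.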
    if(_convert_to_qasymm8)
    {
        // Set data type for converted weights
        TensorInfo weights_info(*b->info());
        weights_info.set_data_type(DataType::QASYMM8);
        _qasymm8_weights.allocator()->init(weights_info);
        _weights_to_qasymm8->configure(compile_context, b, &_qasymm8_weights, ConvertPolicy::WRAP, 0);
    }

    const ICLTensor *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b;
    if(_is_gemm_reshaped)
    {
        matrix_b = &_tmp_b;

        if(!_reshape_b_only_on_first_run)
        {
            _memory_group.manage(&_tmp_b);
        }

        // Pick up the GEMM configuration
        // Whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED doesn't matter here, since it only affects the shape configuration
        std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);

        // Configure reshape RHS kernel
        _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
    }

    // Using default reduction info
    const GEMMLowpReductionKernelInfo reduction_info{};

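    // Why the reductions below are needed (the standard gemmlowp identity; a reference sketch, with
    // the exact handling living inside the offset contribution kernels):
    //
    //   dst_ij = sum_k (a_ik - a_offset) * (b_kj - b_offset)
    //          = sum_k a_ik * b_kj               (raw int32 GEMM)
    //          - b_offset * sum_k a_ik           (row sums of A, needed when b_offset != 0)
    //          - a_offset * sum_k b_kj           (column sums of B, needed when a_offset != 0)
    //          + k * a_offset * b_offset         (constant term)
    //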
    // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
    if(_a_offset != 0)
    {
        TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
        _vector_sum_col.allocator()->init(info_vector_sum_col);
        if(!_reshape_b_only_on_first_run)
        {
            _memory_group.manage(&_vector_sum_col);
        }

        // Configure Matrix B reduction kernel
        _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info);
    }

    // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
    if(_b_offset != 0)
    {
        TensorInfo info_vector_sum_row(compute_reductionB_shape(*a->info()), 1, DataType::S32);
        _vector_sum_row.allocator()->init(info_vector_sum_row);
        _memory_group.manage(&_vector_sum_row);

        // Configure matrix A reduction kernel
        _mtx_a_reduction_kernel->configure(compile_context, a, &_vector_sum_row, reduction_info);
    }

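    // Populate the GEMM kernel descriptor, then branch on whether an output stage was requested:
    // with one, the int32 accumulators are requantized to the narrow output type (fused into the
    // matrix multiply kernel when possible, otherwise staged through a temporary S32 buffer);
    // without one, the raw S32 result only needs the offset contribution.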
    GEMMKernelInfo gemm_kernel_info;
    gemm_kernel_info.m                       = m;
    gemm_kernel_info.n                       = n;
    gemm_kernel_info.k                       = k;
    gemm_kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
    gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
    gemm_kernel_info.lhs_info                = lhs_info;
    gemm_kernel_info.rhs_info                = rhs_info;
    gemm_kernel_info.a_offset                = _a_offset;
    gemm_kernel_info.b_offset                = _b_offset;
    // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
    if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
    {
        // Configure offset contribution kernel
        const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;

        _gemm_output_stage_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
        _gemm_output_stage_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));

        GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
        gemmlowp_output_stage.output_data_type        = _matrix_a->info()->data_type();

        gemm_kernel_info.output_stage = gemmlowp_output_stage;

        if(_is_gemm_reshaped && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
        {
            // Configure and tune matrix multiply kernel with fused output stage
            _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
                                                    _b_offset == 0 ? nullptr : &_vector_sum_row, c, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
        }
        else
        {
            _run_output_stage = true;

            _memory_group.manage(&_mm_result_s32);

            if(_is_gemm_reshaped)
            {
                _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, gemm_kernel_info);
            }
            else
            {
                // Pick up the GEMM configuration
                std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);

                // Configure matrix multiply kernel
                _mm_native_kernel->configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));

                _offset_contribution_output_stage_kernel->configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output,
                                                                    a->info()->dimension(0),
                                                                    _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
                _mm_result_s32.allocator()->allocate();
            }
        }

        _gemm_output_stage_multipliers.allocator()->allocate();
        _gemm_output_stage_shifts.allocator()->allocate();
        // Compute GEMM output multipliers and shifts for output stage
        _gemm_output_stage_multipliers.map();
        _gemm_output_stage_shifts.map();
        std::memcpy(_gemm_output_stage_multipliers.ptr_to_element(Coordinates(0)), gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), num_filters * sizeof(int32_t));
        std::memcpy(_gemm_output_stage_shifts.ptr_to_element(Coordinates(0)), gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t));
        _gemm_output_stage_multipliers.unmap();
        _gemm_output_stage_shifts.unmap();
    }
    else
    {
        _run_offset_contribution = true;
        if(_is_gemm_reshaped)
        {
            // Configure and tune matrix multiply kernel
            _mm_reshaped_only_rhs_kernel->configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info);
        }
        else
        {
            // Pick up the GEMM configuration
            std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);

            // Configure matrix multiply kernel
            _mm_native_kernel->configure(compile_context, _matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
        }

        // Configure offset contribution kernel
        _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset,
                                               _b_offset);
    }

    // Allocate tensors
    if(_is_gemm_reshaped)
    {
        if(!_reshape_b_only_on_first_run)
        {
            _tmp_b.allocator()->allocate();
        }
    }

    if(_a_offset != 0 && !_reshape_b_only_on_first_run)
    {
        _vector_sum_col.allocator()->allocate();
    }

    if(_b_offset != 0)
    {
        _vector_sum_row.allocator()->allocate();
    }
}

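// A minimal usage sketch (illustrative only: the tensor names are assumptions, and with a default
// GEMMInfo the output stage is NONE, so the destination holds raw S32 accumulators):
//
//   CLTensor a, b, dst; // QASYMM8 inputs and S32 output, initialized and allocated beforehand
//   CLGEMMLowpMatrixMultiplyCore gemmlowp;
//   gemmlowp.configure(&a, &b, nullptr, &dst, GEMMInfo());
//   gemmlowp.run(); // the first run() also triggers prepare()
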
Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
    ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8 && b->data_type() == DataType::QASYMM8_SIGNED);
    ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8_SIGNED && b->data_type() == DataType::QASYMM8);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");

    int32_t a_offset = a->quantization_info().uniform().offset;
    int32_t b_offset = b->quantization_info().uniform().offset;

    const ITensorInfo *matrix_a_info = a;

    TensorInfo        tmp_b_info{};
    GEMMRHSMatrixInfo rhs_info;
    GEMMLHSMatrixInfo lhs_info;

    // Get the GPU target
    const GPUTarget gpu_target = CLScheduler::get().target();

    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
    const unsigned int n                       = b->dimension(0);
    const unsigned int k                       = a->dimension(0);
    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();

    bool reshape_matrix_b = is_gemm_reshaped(m, n, k, a->data_type(), gemm_info.reshape_b_only_on_first_run());

    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);

    bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type())
                              && is_data_type_quantized_asymmetric(a->data_type());
    TensorInfo weights_info(*b);
    if(convert_to_qasymm8)
    {
        b_offset = -128;
        weights_info.set_data_type(DataType::QASYMM8);
        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthConvertLayerKernel::validate(b, &weights_info, ConvertPolicy::WRAP, 0));
    }
    const ITensorInfo *matrix_b_info = &weights_info;
    if(reshape_matrix_b)
    {
        matrix_b_info = &tmp_b_info;

        // Pick up the GEMM configuration
        std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);

        // Validate reshape RHS kernel
        auto_init_if_empty(tmp_b_info, weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info)));
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info));
    }

    TensorInfo info_vector_sum_col{};
    TensorInfo info_vector_sum_row{};

    const GEMMLowpReductionKernelInfo reduction_info;
    // Validate matrix B reduction kernel only if a_offset is not equal to 0
    if(a_offset != 0)
    {
        info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32);

        // Validate Matrix B reduction kernel
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info));
    }

    // Validate Matrix A reduction kernel only if b_offset is not equal to 0
    if(b_offset != 0)
    {
        info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);

        // Validate matrix A reduction kernel
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info));
    }

    GEMMKernelInfo gemm_kernel_info;
    gemm_kernel_info.m                       = m;
    gemm_kernel_info.n                       = n;
    gemm_kernel_info.k                       = k;
    gemm_kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
    gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
    gemm_kernel_info.lhs_info                = lhs_info;
    gemm_kernel_info.rhs_info                = rhs_info;
    gemm_kernel_info.a_offset                = a_offset;
    gemm_kernel_info.b_offset                = b_offset;
    if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
    {
        const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;

        const TensorInfo gemm_output_stage_multipliers_shifts_info(TensorInfo(TensorShape(num_filters), 1, DataType::S32));

        GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
        gemmlowp_output_stage.output_data_type        = a->data_type();

        gemm_kernel_info.output_stage = gemmlowp_output_stage;
        if(reshape_matrix_b && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
        {
            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info,
                                                                                                a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                                b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                                c,
                                                                                                &gemm_output_stage_multipliers_shifts_info,
                                                                                                &gemm_output_stage_multipliers_shifts_info));
        }
        else
        {
            TensorInfo mm_result_s32_info{};

            if(reshape_matrix_b)
            {
                // Output tensor auto initialization if not yet initialized
                auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32));

                // Validate matrix multiply
                ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info));
            }
            else
            {
                // Output tensor auto initialization if not yet initialized
                auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32));

                // Pick up the GEMM configuration
                std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);

                // Validate matrix multiply
                ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
            }

            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
                                                                                                a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                                b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                                c,
                                                                                                output,
                                                                                                a_offset, b_offset,
                                                                                                gemmlowp_output_stage,
                                                                                                &gemm_output_stage_multipliers_shifts_info,
                                                                                                &gemm_output_stage_multipliers_shifts_info));
        }
    }
    else
    {
        if(reshape_matrix_b)
        {
            // Validate matrix multiply
            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info));
        }
        else
        {
            // Pick up the GEMM configuration
            std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);

            // Validate matrix multiply
            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
        }

        if(output->total_size() != 0)
        {
            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output,
                                                                                     a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                     b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                     c,
                                                                                     a_offset, b_offset));
        }
    }

    return Status{};
}
Gian Marco05288a22017-11-21 10:57:50 +0000491void CLGEMMLowpMatrixMultiplyCore::run()
492{
Georgios Pinitas72219332018-06-05 14:56:06 +0100493 prepare();
494
Georgios Pinitasda953f22019-04-02 17:27:03 +0100495 MemoryGroupResourceScope scope_mg(_memory_group);
Gian Marco05288a22017-11-21 10:57:50 +0000496
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +0000497 if(_is_gemm_reshaped)
Gian Marco05288a22017-11-21 10:57:50 +0000498 {
Georgios Pinitas72219332018-06-05 14:56:06 +0100499 if(!_reshape_b_only_on_first_run)
Chunosov5124be52017-11-22 20:42:13 +0700500 {
501 // Run reshape matrix B
Sang-Hoon Parkbef7fa22020-10-21 15:58:54 +0100502 CLScheduler::get().enqueue(*_mtx_b_reshape_kernel, false);
Chunosov5124be52017-11-22 20:42:13 +0700503 }
504 }
505
Georgios Pinitas72219332018-06-05 14:56:06 +0100506 // Run matrix B reduction kernel only if _a_offset is not equal to 0
507 if(_a_offset != 0 && !_reshape_b_only_on_first_run)
Chunosov5124be52017-11-22 20:42:13 +0700508 {
Sang-Hoon Parkbef7fa22020-10-21 15:58:54 +0100509 CLScheduler::get().enqueue(*_mtx_b_reduction_kernel, false);
Gian Marco05288a22017-11-21 10:57:50 +0000510 }
511
Michele Di Giorgiob54ba282020-01-14 15:31:55 +0000512 // Run matrix A reduction kernel only if _b_offset is not equal to 0
513 if(_b_offset != 0)
514 {
Sang-Hoon Parkbef7fa22020-10-21 15:58:54 +0100515 CLScheduler::get().enqueue(*_mtx_a_reduction_kernel, false);
Michele Di Giorgiob54ba282020-01-14 15:31:55 +0000516 }
517
Gian Marco05288a22017-11-21 10:57:50 +0000518 // Run matrix multiply
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +0000519 if(_is_gemm_reshaped)
520 {
Sang-Hoon Parkbef7fa22020-10-21 15:58:54 +0100521 CLScheduler::get().enqueue(*_mm_reshaped_only_rhs_kernel, false);
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +0000522 }
523 else
524 {
Sang-Hoon Parkbef7fa22020-10-21 15:58:54 +0100525 CLScheduler::get().enqueue(*_mm_native_kernel, false);
Gian Marco Iodicedb63b9c2019-01-17 09:47:04 +0000526 }
Michele Di Giorgiob54ba282020-01-14 15:31:55 +0000527 if(_run_output_stage)
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100528 {
529 // Run offset contribution/output stage kernel
Sang-Hoon Parkbef7fa22020-10-21 15:58:54 +0100530 CLScheduler::get().enqueue(*_offset_contribution_output_stage_kernel, true);
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100531 }
Michele Di Giorgiob54ba282020-01-14 15:31:55 +0000532 if(_run_offset_contribution)
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100533 {
534 // Run offset contribution kernel
Sang-Hoon Parkbef7fa22020-10-21 15:58:54 +0100535 CLScheduler::get().enqueue(*_offset_contribution_kernel, true);
Gian Marco Iodice4b908652018-10-18 10:21:02 +0100536 }
Georgios Pinitas72219332018-06-05 14:56:06 +0100537}
Chunosov5124be52017-11-22 20:42:13 +0700538
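// prepare() performs the one-off work: converting the weights to QASYMM8 when needed and, when
// the RHS matrix is constant across calls (reshape_b_only_on_first_run), reshaping it and
// computing its column sums once, after which the original weights tensor is marked as unused
// so the memory manager can release it.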
void CLGEMMLowpMatrixMultiplyCore::prepare()
{
    if(!_is_prepared)
    {
        if(_convert_to_qasymm8)
        {
            _qasymm8_weights.allocator()->allocate();
            CLScheduler::get().enqueue(*_weights_to_qasymm8, false);
        }

        if(_is_gemm_reshaped && _reshape_b_only_on_first_run)
        {
            ARM_COMPUTE_ERROR_ON(!_original_b->is_used());

            // Run reshape kernel and mark original weights tensor as unused
            _tmp_b.allocator()->allocate();
            CLScheduler::get().enqueue(*_mtx_b_reshape_kernel, false);
            _original_b->mark_as_unused();
        }

        // Run matrix B reduction kernel only if _a_offset is not equal to 0
        if(_a_offset != 0 && _reshape_b_only_on_first_run)
        {
            _vector_sum_col.allocator()->allocate();
            CLScheduler::get().enqueue(*_mtx_b_reduction_kernel, false);
        }

        CLScheduler::get().queue().finish();
        _is_prepared = true;
    }
}
} // namespace arm_compute