/*
 * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"

#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "src/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h"
#include "src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/runtime/CL/gemm/CLGEMMKernelSelection.h"

namespace arm_compute
{
using namespace arm_compute::misc::shape_calculator;
using namespace arm_compute::cl_gemm;

namespace
{
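// Select between the native and the reshaped-only-RHS matrix multiply kernels by
// querying the GEMM kernel selection heuristic for the current GPU target.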
inline bool is_gemm_reshaped(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run)
{
    std::unique_ptr<ICLGEMMKernelSelection> gemm_kernel = CLGEMMKernelSelectionFactory::create(CLScheduler::get().target());
    ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_kernel.get());

    CLGEMMKernelSelectionParams params;
    params.m               = m;
    params.n               = n;
    params.k               = k;
    params.is_rhs_constant = reshape_b_only_on_first_run;
    params.data_type       = data_type;

    switch(gemm_kernel->select_kernel(params))
    {
        case CLGEMMKernelType::NATIVE:
            return false;
        case CLGEMMKernelType::RESHAPED_ONLY_RHS:
            return true;
        default:
            ARM_COMPUTE_ERROR("Unsupported GEMMLowp kernel type!");
    }
}
} // namespace

CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(std::move(memory_manager)),
      _weights_to_qasymm8(),
      _mm_native_kernel(),
      _mm_reshaped_only_rhs_kernel(),
      _mtx_b_reshape_kernel(),
      _mtx_a_reduction_kernel(),
      _mtx_b_reduction_kernel(),
      _offset_contribution_kernel(),
      _offset_contribution_output_stage_kernel(),
      _qasymm8_weights(),
      _vector_sum_col(),
      _vector_sum_row(),
      _tmp_b(),
      _mm_result_s32(),
      _gemm_output_stage_multipliers(),
      _gemm_output_stage_shifts(),
      _matrix_a(nullptr),
      _original_b(nullptr),
      _output(nullptr),
      _a_offset(0),
      _b_offset(0),
      _is_gemm_reshaped(true),
      _reshape_b_only_on_first_run(false),
      _is_prepared(false),
      _run_output_stage(false),
      _convert_to_qasymm8(false),
      _run_offset_contribution(false)
{
}

void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info)
{
    configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, gemm_info);
}

void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
    ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));

    _is_prepared                 = false;
    _original_b                  = b;
    _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
    _a_offset                    = a->info()->quantization_info().uniform().offset;
    _matrix_a                    = a;
    _output                      = output;

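    // Per-channel symmetric weights (QSYMM8_PER_CHANNEL) paired with a QASYMM8 input are
    // converted to QASYMM8 so the unsigned kernel path can consume them; the converted
    // weights then use a fixed quantization offset of -128 in place of the symmetric zero offset.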
    _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->info()->data_type()) && is_data_type_quantized_symmetric(b->info()->data_type())
                          && a->info()->data_type() == DataType::QASYMM8;
    _b_offset = _convert_to_qasymm8 ? -128 : b->info()->quantization_info().uniform().offset;

    // Get the GPU target
    const GPUTarget gpu_target = CLScheduler::get().target();

    // Set the target for the kernels
    _mm_native_kernel.set_target(gpu_target);
    _mm_reshaped_only_rhs_kernel.set_target(gpu_target);

    GEMMRHSMatrixInfo rhs_info;
    GEMMLHSMatrixInfo lhs_info;

    // Arguments used by GEMMReshapeInfo
    // If matrix A and matrix B are reshaped before being passed to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo
    // so the kernel knows how the matrices have been reshaped
    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
    const unsigned int m                       = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
    const unsigned int n                       = b->info()->dimension(0);
    const unsigned int k                       = a->info()->dimension(0);
    const unsigned int batch_size              = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();

    // Check if we need to reshape matrix B
    _is_gemm_reshaped = is_gemm_reshaped(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run);

    if(_convert_to_qasymm8)
    {
        // Set data type for converted weights
        TensorInfo weights_info(*b->info());
        weights_info.set_data_type(DataType::QASYMM8);
        _qasymm8_weights.allocator()->init(weights_info);
        _weights_to_qasymm8.configure(compile_context, b, &_qasymm8_weights, ConvertPolicy::WRAP, 0);
    }

    const ICLTensor *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b;
    if(_is_gemm_reshaped)
    {
        matrix_b = &_tmp_b;

        if(!_reshape_b_only_on_first_run)
        {
            _memory_group.manage(&_tmp_b);
        }

        // Pick up the GEMM configuration
        // Whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED doesn't matter, since it only affects the shape configuration
        std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);

        // Configure reshape RHS kernel
        _mtx_b_reshape_kernel.configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
    }

    // Using default reduction info
    const GEMMLowpReductionKernelInfo reduction_info{};

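    // The offset contributions below follow from expanding the quantized product:
    //   sum_k (a_ik - a_offset) * (b_kj - b_offset)
    //     = sum_k a_ik * b_kj - a_offset * sum_k b_kj - b_offset * sum_k a_ik + K * a_offset * b_offset
    // so the column sums of B are only needed when a_offset != 0, and the row sums of A only when b_offset != 0.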
    // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
    if(_a_offset != 0)
    {
        TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
        _vector_sum_col.allocator()->init(info_vector_sum_col);
        if(!_reshape_b_only_on_first_run)
        {
            _memory_group.manage(&_vector_sum_col);
        }

        // Configure Matrix B reduction kernel
        _mtx_b_reduction_kernel.configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info);
    }

    // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
    if(_b_offset != 0)
    {
        TensorInfo info_vector_sum_row(compute_reductionB_shape(*a->info()), 1, DataType::S32);
        _vector_sum_row.allocator()->init(info_vector_sum_row);
        _memory_group.manage(&_vector_sum_row);

        // Configure matrix A reduction kernel
        _mtx_a_reduction_kernel.configure(compile_context, a, &_vector_sum_row, reduction_info);
    }

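    // Collect the shape, reshape and offset parameters consumed by the matrix multiply kernels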
    GEMMKernelInfo gemm_kernel_info;
    gemm_kernel_info.m                       = m;
    gemm_kernel_info.n                       = n;
    gemm_kernel_info.k                       = k;
    gemm_kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
    gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
    gemm_kernel_info.lhs_info                = lhs_info;
    gemm_kernel_info.rhs_info                = rhs_info;
    gemm_kernel_info.a_offset                = _a_offset;
    gemm_kernel_info.b_offset                = _b_offset;
    // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
    if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
    {
        // Configure offset contribution kernel
        const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;

        _gemm_output_stage_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
        _gemm_output_stage_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));

        GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
        gemmlowp_output_stage.output_data_type        = _matrix_a->info()->data_type();

        gemm_kernel_info.output_stage = gemmlowp_output_stage;

        if(_is_gemm_reshaped && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
        {
            // Configure and tune matrix multiply kernel with fused output stage
            _mm_reshaped_only_rhs_kernel.configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
                                                   _b_offset == 0 ? nullptr : &_vector_sum_row, c, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
        }
        else
        {
            _run_output_stage = true;

            _memory_group.manage(&_mm_result_s32);

            if(_is_gemm_reshaped)
            {
                _mm_reshaped_only_rhs_kernel.configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, gemm_kernel_info);
            }
            else
            {
                // Pick up the GEMM configuration
                std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);

                // Configure matrix multiply kernel
                _mm_native_kernel.configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
            }

            _offset_contribution_output_stage_kernel.configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output,
                                                               a->info()->dimension(0),
                                                               _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
            _mm_result_s32.allocator()->allocate();
        }

        _gemm_output_stage_multipliers.allocator()->allocate();
        _gemm_output_stage_shifts.allocator()->allocate();
        // Compute GEMM output multipliers and shifts for output stage
        _gemm_output_stage_multipliers.map();
        _gemm_output_stage_shifts.map();
        std::memcpy(_gemm_output_stage_multipliers.ptr_to_element(Coordinates(0)), gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), num_filters * sizeof(int32_t));
        std::memcpy(_gemm_output_stage_shifts.ptr_to_element(Coordinates(0)), gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t));
        _gemm_output_stage_multipliers.unmap();
        _gemm_output_stage_shifts.unmap();
    }
    else
    {
        _run_offset_contribution = true;
        if(_is_gemm_reshaped)
        {
            // Configure and tune matrix multiply kernel
            _mm_reshaped_only_rhs_kernel.configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info);
        }
        else
        {
            // Pick up the GEMM configuration
            std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);

            // Configure matrix multiply kernel
            _mm_native_kernel.configure(compile_context, _matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
        }

        // Configure offset contribution kernel
        _offset_contribution_kernel.configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset,
                                              _b_offset);
    }

    // Allocate tensors
    if(_is_gemm_reshaped)
    {
        if(!_reshape_b_only_on_first_run)
        {
            _tmp_b.allocator()->allocate();
        }
    }

    if(_a_offset != 0 && !_reshape_b_only_on_first_run)
    {
        _vector_sum_col.allocator()->allocate();
    }

    if(_b_offset != 0)
    {
        _vector_sum_row.allocator()->allocate();
    }
}

Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
    ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8 && b->data_type() == DataType::QASYMM8_SIGNED);
    ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8_SIGNED && b->data_type() == DataType::QASYMM8);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");

    int32_t a_offset = a->quantization_info().uniform().offset;
    int32_t b_offset = b->quantization_info().uniform().offset;

    const ITensorInfo *matrix_a_info = a;

    TensorInfo        tmp_b_info{};
    GEMMRHSMatrixInfo rhs_info;
    GEMMLHSMatrixInfo lhs_info;

    // Get the GPU target
    const GPUTarget gpu_target = CLScheduler::get().target();

    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
    const unsigned int n                       = b->dimension(0);
    const unsigned int k                       = a->dimension(0);
    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();

    bool reshape_matrix_b = is_gemm_reshaped(m, n, k, a->data_type(), gemm_info.reshape_b_only_on_first_run());

    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);

    bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type())
                              && is_data_type_quantized_asymmetric(a->data_type());
    TensorInfo weights_info(*b);
    if(convert_to_qasymm8)
    {
        b_offset = -128;
        weights_info.set_data_type(DataType::QASYMM8);
        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthConvertLayerKernel::validate(b, &weights_info, ConvertPolicy::WRAP, 0));
    }
    const ITensorInfo *matrix_b_info = &weights_info;
    if(reshape_matrix_b)
    {
        matrix_b_info = &tmp_b_info;

        // Pick up the GEMM configuration
        std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);

        // Validate reshape RHS kernel
        auto_init_if_empty(tmp_b_info, weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info)));
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info));
    }

    TensorInfo info_vector_sum_col{};
    TensorInfo info_vector_sum_row{};

    const GEMMLowpReductionKernelInfo reduction_info;
    // Validate matrix B reduction kernel only if a_offset is not equal to 0
    if(a_offset != 0)
    {
        info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32);

        // Validate matrix B reduction kernel
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info));
    }

    // Validate matrix A reduction kernel only if b_offset is not equal to 0
    if(b_offset != 0)
    {
        info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);

        // Validate matrix A reduction kernel
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info));
    }

    GEMMKernelInfo gemm_kernel_info;
    gemm_kernel_info.m                       = m;
    gemm_kernel_info.n                       = n;
    gemm_kernel_info.k                       = k;
    gemm_kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
    gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
    gemm_kernel_info.lhs_info                = lhs_info;
    gemm_kernel_info.rhs_info                = rhs_info;
    gemm_kernel_info.a_offset                = a_offset;
    gemm_kernel_info.b_offset                = b_offset;
    if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
    {
        const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;

        const TensorInfo gemm_output_stage_multipliers_shifts_info(TensorInfo(TensorShape(num_filters), 1, DataType::S32));

        GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
        gemmlowp_output_stage.output_data_type        = a->data_type();

        gemm_kernel_info.output_stage = gemmlowp_output_stage;
        if(reshape_matrix_b && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
        {
            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info,
                                                                                                a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                                b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                                c,
                                                                                                &gemm_output_stage_multipliers_shifts_info,
                                                                                                &gemm_output_stage_multipliers_shifts_info));
        }
        else
        {
            TensorInfo mm_result_s32_info{};

            if(reshape_matrix_b)
            {
                // Output tensor auto initialization if not yet initialized
                auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32));

                // Validate matrix multiply
                ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info));
            }
            else
            {
                // Output tensor auto initialization if not yet initialized
                auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32));

                // Pick up the GEMM configuration
                std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);

                // Validate matrix multiply
                ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
            }

            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
                                                                                                a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                                b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                                c,
                                                                                                output,
                                                                                                a_offset, b_offset,
                                                                                                gemmlowp_output_stage,
                                                                                                &gemm_output_stage_multipliers_shifts_info,
                                                                                                &gemm_output_stage_multipliers_shifts_info));
        }
    }
    else
    {
        if(reshape_matrix_b)
        {
            // Validate matrix multiply
            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info));
        }
        else
        {
            // Pick up the GEMM configuration
            std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);

            // Validate matrix multiply
            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
        }

        if(output->total_size() != 0)
        {
            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output,
                                                                                     a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                     b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                     c,
                                                                                     a_offset, b_offset));
        }
    }

    return Status{};
}

void CLGEMMLowpMatrixMultiplyCore::run()
{
    prepare();

    MemoryGroupResourceScope scope_mg(_memory_group);

    if(_is_gemm_reshaped)
    {
        if(!_reshape_b_only_on_first_run)
        {
            // Run reshape matrix B
            CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);
        }
    }

    // Run matrix B reduction kernel only if _a_offset is not equal to 0
    if(_a_offset != 0 && !_reshape_b_only_on_first_run)
    {
        CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
    }

    // Run matrix A reduction kernel only if _b_offset is not equal to 0
    if(_b_offset != 0)
    {
        CLScheduler::get().enqueue(_mtx_a_reduction_kernel, false);
    }

    // Run matrix multiply
    if(_is_gemm_reshaped)
    {
        CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, false);
    }
    else
    {
        CLScheduler::get().enqueue(_mm_native_kernel, false);
    }
    if(_run_output_stage)
    {
        // Run offset contribution/output stage kernel
        CLScheduler::get().enqueue(_offset_contribution_output_stage_kernel, true);
    }
    if(_run_offset_contribution)
    {
        // Run offset contribution kernel
        CLScheduler::get().enqueue(_offset_contribution_kernel, true);
    }
}

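// One-off preparation: convert and/or reshape the constant weights and pre-compute the
// matrix B column sums where possible, so run() only enqueues the per-invocation kernels.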
void CLGEMMLowpMatrixMultiplyCore::prepare()
{
    if(!_is_prepared)
    {
        if(_convert_to_qasymm8)
        {
            _qasymm8_weights.allocator()->allocate();
            CLScheduler::get().enqueue(_weights_to_qasymm8, false);
        }

        if(_is_gemm_reshaped && _reshape_b_only_on_first_run)
        {
            ARM_COMPUTE_ERROR_ON(!_original_b->is_used());

            // Run reshape kernel and mark original weights tensor as unused
            _tmp_b.allocator()->allocate();
            CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);
            _original_b->mark_as_unused();
        }

        // Run matrix B reduction kernel only if _a_offset is not equal to 0
        if(_a_offset != 0 && _reshape_b_only_on_first_run)
        {
            _vector_sum_col.allocator()->allocate();
            CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
        }

        CLScheduler::get().queue().finish();
        _is_prepared = true;
    }
}
} // namespace arm_compute