/*
 * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"

#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h"
#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelection.h"

namespace arm_compute
{
using namespace arm_compute::misc::shape_calculator;
using namespace arm_compute::cl_gemm;

namespace
{
inline bool is_gemm_reshaped(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run)
{
    std::unique_ptr<ICLGEMMKernelSelection> gemm_kernel = CLGEMMKernelSelectionFactory::create(CLScheduler::get().target());
    ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_kernel.get());

    CLGEMMKernelSelectionParams params;
    params.m               = m;
    params.n               = n;
    params.k               = k;
    params.is_rhs_constant = reshape_b_only_on_first_run;
    params.data_type       = data_type;

    // Query the device-specific selection heuristic; only the NATIVE and
    // RESHAPED_ONLY_RHS kernel types are supported by this function
    switch(gemm_kernel->select_kernel(params))
    {
        case CLGEMMKernelType::NATIVE:
            return false;
        case CLGEMMKernelType::RESHAPED_ONLY_RHS:
            return true;
        default:
            ARM_COMPUTE_ERROR("Unsupported GEMMLowp kernel type!");
    }
}
} // namespace

CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(std::move(memory_manager)),
      _weights_to_qasymm8(),
      _mm_native_kernel(),
      _mm_reshaped_only_rhs_kernel(),
      _mtx_b_reshape_kernel(),
      _mtx_a_reduction_kernel(),
      _mtx_b_reduction_kernel(),
      _offset_contribution_kernel(),
      _offset_contribution_output_stage_kernel(),
      _qasymm8_weights(),
      _vector_sum_col(),
      _vector_sum_row(),
      _tmp_b(),
      _mm_result_s32(),
      _gemm_output_stage_multipliers(),
      _gemm_output_stage_shifts(),
      _matrix_a(nullptr),
      _original_b(nullptr),
      _output(nullptr),
      _a_offset(0),
      _b_offset(0),
      _is_gemm_reshaped(true),
      _reshape_b_only_on_first_run(false),
      _is_prepared(false),
      _run_output_stage(false),
      _convert_to_qasymm8(false),
      _run_offset_contribution(false)
{
}

void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info)
{
    configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, gemm_info);
}

void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
    ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));

    _is_prepared                 = false;
    _original_b                  = b;
    _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
    _a_offset                    = a->info()->quantization_info().uniform().offset;
    _matrix_a                    = a;
    _output                      = output;

    _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->info()->data_type()) && is_data_type_quantized_symmetric(b->info()->data_type())
                          && a->info()->data_type() == DataType::QASYMM8;
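    // Symmetric signed weights (e.g. QSYMM8_PER_CHANNEL) are converted to QASYMM8 so that the
    // QASYMM8 matrix multiply kernels can be reused; the fixed -128 b_offset below compensates
    // for the signed-to-unsigned remapping of the stored weight values.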
    _b_offset = _convert_to_qasymm8 ? -128 : b->info()->quantization_info().uniform().offset;

    // Get the GPU target
    const GPUTarget gpu_target = CLScheduler::get().target();

    // Set the target for the kernels
    _mm_native_kernel.set_target(gpu_target);
    _mm_reshaped_only_rhs_kernel.set_target(gpu_target);

    GEMMRHSMatrixInfo rhs_info;
    GEMMLHSMatrixInfo lhs_info;

    // Arguments used by GEMMReshapeInfo
    // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo
    // so that the matrix multiply kernel knows how the matrices have been reshaped
    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
    const unsigned int m                       = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
    const unsigned int n                       = b->info()->dimension(0);
    const unsigned int k                       = a->info()->dimension(0);
    const unsigned int batch_size              = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();

    // Check if we need to reshape the matrix A and matrix B
    _is_gemm_reshaped = is_gemm_reshaped(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run);

    if(_convert_to_qasymm8)
    {
        // Set data type for converted weights
        TensorInfo weights_info(*b->info());
        weights_info.set_data_type(DataType::QASYMM8);
        _qasymm8_weights.allocator()->init(weights_info);
        _weights_to_qasymm8.configure(compile_context, b, &_qasymm8_weights, ConvertPolicy::WRAP, 0);
    }

    const ICLTensor *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b;
    if(_is_gemm_reshaped)
    {
        matrix_b = &_tmp_b;

        if(!_reshape_b_only_on_first_run)
        {
            _memory_group.manage(&_tmp_b);
        }

        // Pick up the GEMM configuration
        // Whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED doesn't matter here, since it only affects the shape configuration
        std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);

        // Configure reshape RHS kernel
        _mtx_b_reshape_kernel.configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
    }

    // Using default reduction info
    const GEMMLowpReductionKernelInfo reduction_info {};

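    // Why the reduction kernels below exist: with quantized operands, each output element is
    //   sum_k (a_ik - a_offset) * (b_kj - b_offset)
    //     = sum_k a_ik * b_kj - b_offset * sum_k a_ik - a_offset * sum_k b_kj + k * a_offset * b_offset
    // so besides the raw integer product, the offset contribution needs the per-column sums of
    // matrix B (only when a_offset != 0) and the per-row sums of matrix A (only when b_offset != 0).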
    // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
    if(_a_offset != 0)
    {
        TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
        _vector_sum_col.allocator()->init(info_vector_sum_col);
        if(!_reshape_b_only_on_first_run)
        {
            _memory_group.manage(&_vector_sum_col);
        }

        // Configure matrix B reduction kernel
        _mtx_b_reduction_kernel.configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info);
    }

    // Initialize matrix A reduction kernel only if _b_offset is not equal to 0
    if(_b_offset != 0)
    {
        TensorInfo info_vector_sum_row(compute_reductionB_shape(*a->info()), 1, DataType::S32);
        _vector_sum_row.allocator()->init(info_vector_sum_row);
        _memory_group.manage(&_vector_sum_row);

        // Configure matrix A reduction kernel
        _mtx_a_reduction_kernel.configure(compile_context, a, &_vector_sum_row, reduction_info);
    }

    GEMMKernelInfo gemm_kernel_info;
    gemm_kernel_info.m                       = m;
    gemm_kernel_info.n                       = n;
    gemm_kernel_info.k                       = k;
    gemm_kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
    gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
    gemm_kernel_info.lhs_info                = lhs_info;
    gemm_kernel_info.rhs_info                = rhs_info;
    gemm_kernel_info.a_offset                = _a_offset;
    gemm_kernel_info.b_offset                = _b_offset;
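
    // From here on, three paths are possible:
    //   1) an output stage is requested, the RHS is reshaped and the stage is QUANTIZE_DOWN_FIXEDPOINT:
    //      the offset contribution and the output stage are fused into the matrix multiply kernel;
    //   2) an output stage is requested otherwise: multiply into an intermediate S32 tensor, then run
    //      the combined offset-contribution/output-stage kernel;
    //   3) no output stage: multiply straight into the output, then run the offset-contribution kernel.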
    // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
    if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
    {
        // Per-channel quantized output stages provide one multiplier/shift pair per filter
        const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;

        _gemm_output_stage_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
        _gemm_output_stage_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));

        GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
        gemmlowp_output_stage.output_data_type        = _matrix_a->info()->data_type();

        gemm_kernel_info.output_stage = gemmlowp_output_stage;

        if(_is_gemm_reshaped && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
        {
            // Configure and tune matrix multiply kernel with fused output stage
            _mm_reshaped_only_rhs_kernel.configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
                                                   _b_offset == 0 ? nullptr : &_vector_sum_row, c, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
        }
        else
        {
            _run_output_stage = true;

            _memory_group.manage(&_mm_result_s32);

            if(_is_gemm_reshaped)
            {
                _mm_reshaped_only_rhs_kernel.configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, gemm_kernel_info);
            }
            else
            {
                // Pick up the GEMM configuration
                std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);

                // Configure matrix multiply kernel
                _mm_native_kernel.configure(compile_context, _matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
            }

            // Configure the offset contribution kernel fused with the requested output stage; this must
            // be set up for both the reshaped and the native unfused paths, since run() enqueues it
            // whenever _run_output_stage is true
            _offset_contribution_output_stage_kernel.configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output,
                                                               a->info()->dimension(0),
                                                               _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
            _mm_result_s32.allocator()->allocate();
        }

        _gemm_output_stage_multipliers.allocator()->allocate();
        _gemm_output_stage_shifts.allocator()->allocate();
        // Compute GEMM output multipliers and shifts for output stage
        _gemm_output_stage_multipliers.map();
        _gemm_output_stage_shifts.map();
        std::memcpy(_gemm_output_stage_multipliers.ptr_to_element(Coordinates(0)), gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), num_filters * sizeof(int32_t));
        std::memcpy(_gemm_output_stage_shifts.ptr_to_element(Coordinates(0)), gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t));
        _gemm_output_stage_multipliers.unmap();
        _gemm_output_stage_shifts.unmap();
    }
    else
    {
        _run_offset_contribution = true;
        if(_is_gemm_reshaped)
        {
            // Configure and tune matrix multiply kernel
            _mm_reshaped_only_rhs_kernel.configure(compile_context, _matrix_a, matrix_b, output, gemm_kernel_info);
        }
        else
        {
            // Pick up the GEMM configuration
            std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);

            // Configure matrix multiply kernel
            _mm_native_kernel.configure(compile_context, _matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
        }

        // Configure offset contribution kernel
        _offset_contribution_kernel.configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset,
                                              _b_offset);
    }

    // Allocate tensors
    if(_is_gemm_reshaped)
    {
        if(!_reshape_b_only_on_first_run)
        {
            _tmp_b.allocator()->allocate();
        }
    }

    if(_a_offset != 0 && !_reshape_b_only_on_first_run)
    {
        _vector_sum_col.allocator()->allocate();
    }

    if(_b_offset != 0)
    {
        _vector_sum_row.allocator()->allocate();
    }
}
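
// Illustrative usage sketch (comment only, not part of the library): setting up a quantized GEMM
// with this function. Shapes and quantization parameters below are invented for the example.
//
//   CLTensor a, b, dst;
//   a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::QASYMM8, QuantizationInfo(0.05f, 10))); // K = 64, M = 32
//   b.allocator()->init(TensorInfo(TensorShape(16U, 64U), 1, DataType::QASYMM8, QuantizationInfo(0.07f, 3)));  // N = 16, K = 64
//   dst.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::S32)); // S32 output since no output stage is fused
//
//   CLGEMMLowpMatrixMultiplyCore gemm;
//   gemm.configure(&a, &b, nullptr, &dst, GEMMInfo());
//   a.allocator()->allocate();
//   b.allocator()->allocate();
//   dst.allocator()->allocate();
//   // ...fill a and b...
//   gemm.run();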

Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
    ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8 && b->data_type() == DataType::QASYMM8_SIGNED);
    ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8_SIGNED && b->data_type() == DataType::QASYMM8);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");

    int32_t a_offset = a->quantization_info().uniform().offset;
    int32_t b_offset = b->quantization_info().uniform().offset;

    const ITensorInfo *matrix_a_info = a;

    TensorInfo        tmp_b_info{};
    GEMMRHSMatrixInfo rhs_info;
    GEMMLHSMatrixInfo lhs_info;

    // Get the GPU target
    const GPUTarget gpu_target = CLScheduler::get().target();

    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
    const unsigned int n                       = b->dimension(0);
    const unsigned int k                       = a->dimension(0);
    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();

    bool reshape_matrix_b = is_gemm_reshaped(m, n, k, a->data_type(), gemm_info.reshape_b_only_on_first_run());

    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);

    bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type())
                              && is_data_type_quantized_asymmetric(a->data_type());
    TensorInfo weights_info(*b);
    if(convert_to_qasymm8)
    {
        b_offset = -128;
        weights_info.set_data_type(DataType::QASYMM8);
        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthConvertLayerKernel::validate(b, &weights_info, ConvertPolicy::WRAP, 0));
    }
    const ITensorInfo *matrix_b_info = &weights_info;
    if(reshape_matrix_b)
    {
        matrix_b_info = &tmp_b_info;

        // Pick up the GEMM configuration
        std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);

        // Validate reshape RHS kernel
        auto_init_if_empty(tmp_b_info, weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info)));
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info));
    }

    TensorInfo info_vector_sum_col{};
    TensorInfo info_vector_sum_row{};

    const GEMMLowpReductionKernelInfo reduction_info;
    // Validate matrix B reduction kernel only if a_offset is not equal to 0
    if(a_offset != 0)
    {
        info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32);

        // Validate matrix B reduction kernel
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info));
    }

    // Validate matrix A reduction kernel only if b_offset is not equal to 0
    if(b_offset != 0)
    {
        info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);

        // Validate matrix A reduction kernel
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info));
    }

    GEMMKernelInfo gemm_kernel_info;
    gemm_kernel_info.m                       = m;
    gemm_kernel_info.n                       = n;
    gemm_kernel_info.k                       = k;
    gemm_kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
    gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
    gemm_kernel_info.lhs_info                = lhs_info;
    gemm_kernel_info.rhs_info                = rhs_info;
    gemm_kernel_info.a_offset                = a_offset;
    gemm_kernel_info.b_offset                = b_offset;
    if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
    {
        const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;

        const TensorInfo gemm_output_stage_multipliers_shifts_info(TensorInfo(TensorShape(num_filters), 1, DataType::S32));

        GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
        gemmlowp_output_stage.output_data_type        = a->data_type();

        gemm_kernel_info.output_stage = gemmlowp_output_stage;
        if(reshape_matrix_b && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
        {
            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info,
                                                                                                a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                                b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                                c,
                                                                                                &gemm_output_stage_multipliers_shifts_info,
                                                                                                &gemm_output_stage_multipliers_shifts_info));
        }
        else
        {
            TensorInfo mm_result_s32_info{};

            if(reshape_matrix_b)
            {
                // Output tensor auto initialization if not yet initialized
                auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32));

                // Validate matrix multiply
                ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info));
            }
            else
            {
                // Output tensor auto initialization if not yet initialized
                auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32));

                // Pick up the GEMM configuration
                std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);

                // Validate matrix multiply
                ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
            }

            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
                                                                                                a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                                b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                                c,
                                                                                                output,
                                                                                                a_offset, b_offset,
                                                                                                gemmlowp_output_stage,
                                                                                                &gemm_output_stage_multipliers_shifts_info,
                                                                                                &gemm_output_stage_multipliers_shifts_info));
        }
    }
    else
    {
        if(reshape_matrix_b)
        {
            // Validate matrix multiply
            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info));
        }
        else
        {
            // Pick up the GEMM configuration
            std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);

            // Validate matrix multiply
            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
        }

        if(output->total_size() != 0)
        {
            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output,
                                                                                     a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                     b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                     c,
                                                                                     a_offset, b_offset));
        }
    }

    return Status{};
}
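
// Note: validate() mirrors configure() and can be used to check a configuration up front without
// allocating any resources, e.g. (illustrative sketch, not part of the library):
//
//   const Status s = CLGEMMLowpMatrixMultiplyCore::validate(a.info(), b.info(), nullptr, dst.info(), GEMMInfo());
//   ARM_COMPUTE_ERROR_THROW_ON(s);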

void CLGEMMLowpMatrixMultiplyCore::run()
{
    prepare();

    MemoryGroupResourceScope scope_mg(_memory_group);

    if(_is_gemm_reshaped)
    {
        if(!_reshape_b_only_on_first_run)
        {
            // Run reshape matrix B
            CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);
        }
    }

    // Run matrix B reduction kernel only if _a_offset is not equal to 0
    if(_a_offset != 0 && !_reshape_b_only_on_first_run)
    {
        CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
    }

    // Run matrix A reduction kernel only if _b_offset is not equal to 0
    if(_b_offset != 0)
    {
        CLScheduler::get().enqueue(_mtx_a_reduction_kernel, false);
    }

    // Run matrix multiply
    if(_is_gemm_reshaped)
    {
        CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, false);
    }
    else
    {
        CLScheduler::get().enqueue(_mm_native_kernel, false);
    }
    if(_run_output_stage)
    {
        // Run offset contribution/output stage kernel
        CLScheduler::get().enqueue(_offset_contribution_output_stage_kernel, true);
    }
    if(_run_offset_contribution)
    {
        // Run offset contribution kernel
        CLScheduler::get().enqueue(_offset_contribution_kernel, true);
    }
}

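// prepare() performs the one-shot work that depends only on the constant weights: the optional
// conversion of the weights to QASYMM8, the RHS reshape and the matrix B reduction. Afterwards
// the original weights tensor is marked as unused so the memory manager can reclaim its memory.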
void CLGEMMLowpMatrixMultiplyCore::prepare()
{
    if(!_is_prepared)
    {
        if(_convert_to_qasymm8)
        {
            _qasymm8_weights.allocator()->allocate();
            CLScheduler::get().enqueue(_weights_to_qasymm8, false);
        }

        if(_is_gemm_reshaped && _reshape_b_only_on_first_run)
        {
            ARM_COMPUTE_ERROR_ON(!_original_b->is_used());

            // Run reshape kernel and mark original weights tensor as unused
            _tmp_b.allocator()->allocate();
            CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);
            _original_b->mark_as_unused();
        }

        // Run matrix B reduction kernel only if _a_offset is not equal to 0
        if(_a_offset != 0 && _reshape_b_only_on_first_run)
        {
            _vector_sum_col.allocator()->allocate();
            CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
        }

        CLScheduler::get().queue().finish();
        _is_prepared = true;
    }
}
} // namespace arm_compute