/*
 * Copyright (c) 2017-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"

#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h"
#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

#include <cstring> // std::memcpy, used when copying the per-channel multipliers and shifts
namespace arm_compute
{
using namespace arm_compute::misc::shape_calculator;
using namespace arm_compute::cl_gemm;

namespace
{
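// The reshaped-only-RHS GEMM path is enabled on every post-Midgard architecture, provided matrix B
// only needs to be reshaped on the first run.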
inline bool is_gemm_reshaped(bool reshape_b_only_on_first_run, GPUTarget gpu_target)
{
    return (get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) && (reshape_b_only_on_first_run);
}
} // namespace

CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(std::move(memory_manager)),
      _weights_to_qasymm8(),
      _mm_midgard_kernel(),
      _mm_native_kernel(),
      _mm_reshaped_only_rhs_kernel(),
      _mtx_b_reshape_kernel(),
      _mtx_a_reduction_kernel(),
      _mtx_b_reduction_kernel(),
      _offset_contribution_kernel(),
      _offset_contribution_output_stage_kernel(),
      _qasymm8_weights(),
      _vector_sum_col(),
      _vector_sum_row(),
      _tmp_b(),
      _mm_result_s32(),
      _gemm_output_stage_multipliers(),
      _gemm_output_stage_shifts(),
      _matrix_a(nullptr),
      _original_b(nullptr),
      _output(nullptr),
      _a_offset(0),
      _b_offset(0),
      _is_gemm_reshaped(true),
      _is_midgard(false),
      _reshape_b_only_on_first_run(false),
      _is_prepared(false),
      _run_output_stage(false),
      _convert_to_qasymm8(false),
      _run_offset_contribution(false)
{
}

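// Minimal usage sketch for this function (illustrative only: tensor names are placeholders and the
// CL scheduler plus all tensors are assumed to be configured and allocated elsewhere):
//
//     CLGEMMLowpMatrixMultiplyCore gemmlowp;
//     gemmlowp.configure(&input_a, &weights_b, nullptr, &dst_s32, GEMMInfo()); // S32 destination, no fused output stage
//     gemmlowp.run();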
void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
    ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));

    _is_prepared                 = false;
    _original_b                  = b;
    _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
    _a_offset                    = a->info()->quantization_info().uniform().offset;
    _matrix_a                    = a;
    _output                      = output;

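    // Per-channel symmetric (QSYMM8_PER_CHANNEL) weights are converted to QASYMM8 when the input is
    // asymmetric, so the QASYMM8 kernel paths can be reused; the wrap-around conversion is compensated
    // for by forcing the matrix B offset to -128.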
    _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->info()->data_type()) && is_data_type_quantized_symmetric(b->info()->data_type())
                          && is_data_type_quantized_asymmetric(a->info()->data_type());
    _b_offset = _convert_to_qasymm8 ? -128 : b->info()->quantization_info().uniform().offset;

    // Get the GPU target
    const GPUTarget gpu_target = CLScheduler::get().target();

    // Set the target for the kernels
    _mm_midgard_kernel.set_target(gpu_target);
    _mm_native_kernel.set_target(gpu_target);
    _mm_reshaped_only_rhs_kernel.set_target(gpu_target);

    GEMMRHSMatrixInfo rhs_info;
    GEMMLHSMatrixInfo lhs_info;

    // Arguments used by GEMMReshapeInfo
    // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
    // in order to know how the matrices have been reshaped
    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
    const unsigned int m                       = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
    const unsigned int n                       = b->info()->dimension(0);
    const unsigned int k                       = a->info()->dimension(0);
    const unsigned int batch_size              = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();

    // Check if we need to reshape the matrix A and matrix B
    _is_gemm_reshaped = is_gemm_reshaped(_reshape_b_only_on_first_run, gpu_target);
    _is_midgard       = gpu_target == GPUTarget::MIDGARD;

    if(_convert_to_qasymm8)
    {
        // Set data type for converted weights
        TensorInfo weights_info(*b->info());
        weights_info.set_data_type(DataType::QASYMM8);
        _qasymm8_weights.allocator()->init(weights_info);
        _weights_to_qasymm8.configure(b, &_qasymm8_weights, ConvertPolicy::WRAP, 0);
    }

    const ICLTensor *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b;
    if(_is_gemm_reshaped)
    {
        matrix_b = &_tmp_b;

        if(!_reshape_b_only_on_first_run)
        {
            _memory_group.manage(&_tmp_b);
        }

        // Pick up the GEMM configuration
        // Whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED does not matter here, since it only affects the shape configuration
        std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);

        // Configure reshape RHS kernel
        _mtx_b_reshape_kernel.configure(_convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
    }

    // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
    if(_a_offset != 0)
    {
        TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
        _vector_sum_col.allocator()->init(info_vector_sum_col);
        if(!_reshape_b_only_on_first_run)
        {
            _memory_group.manage(&_vector_sum_col);
        }

        // Configure Matrix B reduction kernel
        _mtx_b_reduction_kernel.configure(_convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col);
    }

    // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
    if(_b_offset != 0)
    {
        TensorInfo info_vector_sum_row(compute_reductionB_shape(*a->info()), 1, DataType::S32);
        _vector_sum_row.allocator()->init(info_vector_sum_row);
        _memory_group.manage(&_vector_sum_row);

        // Configure matrix A reduction kernel
        _mtx_a_reduction_kernel.configure(a, &_vector_sum_row);
    }

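    // Gather the matrix dimensions, reshape flags and quantization offsets consumed by the matrix multiply kernels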
    GEMMKernelInfo gemm_kernel_info;
    gemm_kernel_info.m                       = m;
    gemm_kernel_info.n                       = n;
    gemm_kernel_info.k                       = k;
    gemm_kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
    gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
    gemm_kernel_info.lhs_info                = lhs_info;
    gemm_kernel_info.rhs_info                = rhs_info;
    gemm_kernel_info.a_offset                = _a_offset;
    gemm_kernel_info.b_offset                = _b_offset;
    // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
    if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
    {
        // Configure offset contribution kernel
        const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;

        _gemm_output_stage_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
        _gemm_output_stage_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));

        GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
        gemmlowp_output_stage.output_data_type        = _matrix_a->info()->data_type();

        gemm_kernel_info.output_stage = gemmlowp_output_stage;

        if(_is_gemm_reshaped && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
        {
            // Configure and tune matrix multiply kernel with fused output stage
            _mm_reshaped_only_rhs_kernel.configure(_matrix_a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
                                                   _b_offset == 0 ? nullptr : &_vector_sum_row, c, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
        }
        else
        {
            _run_output_stage = true;

            _memory_group.manage(&_mm_result_s32);

            if(_is_gemm_reshaped)
            {
                _mm_reshaped_only_rhs_kernel.configure(_matrix_a, matrix_b, &_mm_result_s32, gemm_kernel_info);
            }
            else
            {
                if(_is_midgard)
                {
                    // Configure matrix multiply kernel
                    _mm_midgard_kernel.configure(_matrix_a, matrix_b, &_mm_result_s32, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
                }
                else
                {
                    // Pick up the GEMM configuration
                    std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);

                    // Configure matrix multiply kernel
                    _mm_native_kernel.configure(_matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
                }
                _offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0),
                                                                    _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);

                _mm_result_s32.allocator()->allocate();
            }
        }

        _gemm_output_stage_multipliers.allocator()->allocate();
        _gemm_output_stage_shifts.allocator()->allocate();
        // Compute GEMM output multipliers and shifts for output stage
        _gemm_output_stage_multipliers.map();
        _gemm_output_stage_shifts.map();
        std::memcpy(_gemm_output_stage_multipliers.ptr_to_element(Coordinates(0)), gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), num_filters * sizeof(int32_t));
        std::memcpy(_gemm_output_stage_shifts.ptr_to_element(Coordinates(0)), gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t));
        _gemm_output_stage_multipliers.unmap();
        _gemm_output_stage_shifts.unmap();
    }
    else
    {
        _run_offset_contribution = true;
        if(_is_gemm_reshaped)
        {
            // Configure and tune matrix multiply kernel
            _mm_reshaped_only_rhs_kernel.configure(_matrix_a, matrix_b, output, gemm_kernel_info);
        }
        else
        {
            if(_is_midgard)
            {
                // Configure matrix multiply kernel
                _mm_midgard_kernel.configure(_matrix_a, matrix_b, output, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
            }
            else
            {
                // Pick up the GEMM configuration
                std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);

                // Configure matrix multiply kernel
                _mm_native_kernel.configure(_matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
            }
        }

        // Configure offset contribution kernel
        _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset, _b_offset);
    }

    // Allocate tensors
    if(_is_gemm_reshaped)
    {
        if(!_reshape_b_only_on_first_run)
        {
            _tmp_b.allocator()->allocate();
        }
    }

    if(_a_offset != 0 && !_reshape_b_only_on_first_run)
    {
        _vector_sum_col.allocator()->allocate();
    }

    if(_b_offset != 0)
    {
        _vector_sum_row.allocator()->allocate();
    }
}

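// Static validation path: performs the same checks as configure() on ITensorInfo descriptors only,
// by delegating to the validate() method of each kernel involved, without allocating any resources.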
Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
    // DataType::QSYMM8_PER_CHANNEL supported only for weights
    if(b->data_type() != DataType::QSYMM8_PER_CHANNEL)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
    }
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");

    int32_t a_offset = a->quantization_info().uniform().offset;
    int32_t b_offset = b->quantization_info().uniform().offset;

    const ITensorInfo *matrix_a_info = a;

    TensorInfo        tmp_b_info{};
    GEMMRHSMatrixInfo rhs_info;
    GEMMLHSMatrixInfo lhs_info;

    // Get the GPU target
    const GPUTarget gpu_target = CLScheduler::get().target();

    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
    const unsigned int n                       = b->dimension(0);
    const unsigned int k                       = a->dimension(0);
    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
    const bool         is_midgard              = gpu_target == GPUTarget::MIDGARD;

    bool reshape_matrix_b = is_gemm_reshaped(gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());

    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);

    bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type())
                              && is_data_type_quantized_asymmetric(a->data_type());
    TensorInfo weights_info(*b);
    if(convert_to_qasymm8)
    {
        b_offset = -128;
        weights_info.set_data_type(DataType::QASYMM8);
        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthConvertLayerKernel::validate(b, &weights_info, ConvertPolicy::WRAP, 0));
    }
    const ITensorInfo *matrix_b_info = &weights_info;
    if(reshape_matrix_b)
    {
        matrix_b_info = &tmp_b_info;

        // Pick up the GEMM configuration
        std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);

        // Validate reshape RHS kernel
        auto_init_if_empty(tmp_b_info, weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info)));
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info));
    }

    TensorInfo info_vector_sum_col{};
    TensorInfo info_vector_sum_row{};

    // Validate matrix B reduction kernel only if _a_offset is not equal to 0
    if(a_offset != 0)
    {
        info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32);

        // Configure Matrix B reduction kernel
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col));
    }

    // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
    if(b_offset != 0)
    {
        info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);

        // Configure matrix A reduction kernel
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row));
    }

    GEMMKernelInfo gemm_kernel_info;
    gemm_kernel_info.m                       = m;
    gemm_kernel_info.n                       = n;
    gemm_kernel_info.k                       = k;
    gemm_kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
    gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
    gemm_kernel_info.lhs_info                = lhs_info;
    gemm_kernel_info.rhs_info                = rhs_info;
    gemm_kernel_info.a_offset                = a_offset;
    gemm_kernel_info.b_offset                = b_offset;
    if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
    {
        const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;

        const TensorInfo gemm_output_stage_multipliers_shifts_info(TensorInfo(TensorShape(num_filters), 1, DataType::S32));

        GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
        gemmlowp_output_stage.output_data_type        = a->data_type();

        gemm_kernel_info.output_stage = gemmlowp_output_stage;
        if(reshape_matrix_b && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
        {
            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info,
                                                                                                a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                                b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                                c,
                                                                                                &gemm_output_stage_multipliers_shifts_info,
                                                                                                &gemm_output_stage_multipliers_shifts_info));
        }
        else
        {
            TensorInfo mm_result_s32_info{};

            if(reshape_matrix_b)
            {
                // Output tensor auto initialization if not yet initialized
                auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32));

                // Validate matrix multiply
                ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info));
            }
            else
            {
                // Output tensor auto initialization if not yet initialized
                auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32));

                if(is_midgard)
                {
                    // Validate matrix multiply
                    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, reshape_info));
                }
                else
                {
                    // Pick up the GEMM configuration
                    std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);

                    // Validate matrix multiply
                    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
                }
            }

            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
                                                                                                a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                                b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                                c,
                                                                                                output,
                                                                                                a_offset, b_offset,
                                                                                                gemmlowp_output_stage,
                                                                                                &gemm_output_stage_multipliers_shifts_info,
                                                                                                &gemm_output_stage_multipliers_shifts_info));
        }
    }
    else
    {
        if(reshape_matrix_b)
        {
            // Validate matrix multiply
            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info));
        }
        else
        {
            if(is_midgard)
            {
                // Validate matrix multiply
                ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, reshape_info));
            }
            else
            {
                // Pick up the GEMM configuration
                std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);

                // Validate matrix multiply
                ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
            }
        }

        if(output->total_size() != 0)
        {
            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output,
                                                                                     a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                     b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                     c,
                                                                                     a_offset, b_offset));
        }
    }

    return Status{};
}

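// Enqueue order: prepare() for the one-off work, then the optional matrix B reshape and A/B
// reductions, the matrix multiply kernel and finally the offset contribution / output stage kernel.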
void CLGEMMLowpMatrixMultiplyCore::run()
{
    prepare();

    MemoryGroupResourceScope scope_mg(_memory_group);

    if(_is_gemm_reshaped)
    {
        if(!_reshape_b_only_on_first_run)
        {
            // Run reshape matrix B
            CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);
        }
    }

    // Run matrix B reduction kernel only if _a_offset is not equal to 0
    if(_a_offset != 0 && !_reshape_b_only_on_first_run)
    {
        CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
    }

    // Run matrix A reduction kernel only if _b_offset is not equal to 0
    if(_b_offset != 0)
    {
        CLScheduler::get().enqueue(_mtx_a_reduction_kernel, false);
    }

    // Run matrix multiply
    if(_is_gemm_reshaped)
    {
        CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, false);
    }
    else
    {
        if(_is_midgard)
        {
            CLScheduler::get().enqueue(_mm_midgard_kernel, false);
        }
        else
        {
            CLScheduler::get().enqueue(_mm_native_kernel, false);
        }
    }
    if(_run_output_stage)
    {
        // Run offset contribution/output stage kernel
        CLScheduler::get().enqueue(_offset_contribution_output_stage_kernel, true);
    }
    if(_run_offset_contribution)
    {
        // Run offset contribution kernel
        CLScheduler::get().enqueue(_offset_contribution_kernel, true);
    }
}

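// One-off preparation triggered by the first run(): convert the weights to QASYMM8 if required,
// reshape matrix B (marking the original weights tensor as unused) and compute the matrix B
// reduction when B is only reshaped on the first run.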
void CLGEMMLowpMatrixMultiplyCore::prepare()
{
    if(!_is_prepared)
    {
        if(_convert_to_qasymm8)
        {
            _qasymm8_weights.allocator()->allocate();
            CLScheduler::get().enqueue(_weights_to_qasymm8, false);
        }

        if(_is_gemm_reshaped && _reshape_b_only_on_first_run)
        {
            ARM_COMPUTE_ERROR_ON(!_original_b->is_used());

            // Run reshape kernel and mark original weights tensor as unused
            _tmp_b.allocator()->allocate();
            CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);
            _original_b->mark_as_unused();
        }

        // Run matrix B reduction kernel only if _a_offset is not equal to 0
        if(_a_offset != 0 && _reshape_b_only_on_first_run)
        {
            _vector_sum_col.allocator()->allocate();
            CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
        }

        CLScheduler::get().queue().finish();
        _is_prepared = true;
    }
}
} // namespace arm_compute