/*
 * Copyright (c) 2017-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"

#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h"
#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

#include <cstring> // std::memcpy

namespace arm_compute
{
using namespace arm_compute::misc::shape_calculator;
using namespace arm_compute::cl_gemm;

namespace
{
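// Helper used to pick the matrix multiply path: the reshaped (RHS only) kernels are selected only on
// non-Midgard GPUs and only when matrix B can be reshaped once up front (reshape_b_only_on_first_run).
// Otherwise the native kernel is used, or _mm_midgard_kernel on Midgard devices.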
inline bool is_gemm_reshaped(bool reshape_b_only_on_first_run, GPUTarget gpu_target)
{
    return (get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) && (reshape_b_only_on_first_run);
}
} // namespace

CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(std::move(memory_manager)),
      _weights_to_qasymm8(),
      _mm_midgard_kernel(),
      _mm_native_kernel(),
      _mm_reshaped_only_rhs_kernel(),
      _mtx_b_reshape_kernel(),
      _mtx_a_reduction_kernel(),
      _mtx_b_reduction_kernel(),
      _offset_contribution_kernel(),
      _offset_contribution_output_stage_kernel(),
      _qasymm8_weights(),
      _vector_sum_col(),
      _vector_sum_row(),
      _tmp_b(),
      _mm_result_s32(),
      _gemm_output_stage_multipliers(),
      _gemm_output_stage_shifts(),
      _matrix_a(nullptr),
      _original_b(nullptr),
      _output(nullptr),
      _a_offset(0),
      _b_offset(0),
      _is_gemm_reshaped(true),
      _is_midgard(false),
      _reshape_b_only_on_first_run(false),
      _is_prepared(false),
      _fuse_output_stage(false),
      _convert_to_qasymm8(false)
{
}

void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
    ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));

    _is_prepared                 = false;
    _original_b                  = b;
    _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
    _a_offset                    = a->info()->quantization_info().uniform().offset;
    _matrix_a                    = a;
    _output                      = output;

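    // Note: when the weights are QSYMM8_PER_CHANNEL (symmetric) and the input is asymmetric, the
    // weights are converted to a QASYMM8 tensor (_qasymm8_weights) below and a fixed b_offset of
    // -128 is used for the offset contribution instead of the weights' own offset.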
    _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->info()->data_type()) && is_data_type_quantized_symmetric(b->info()->data_type())
                          && is_data_type_quantized_asymmetric(a->info()->data_type());
    _b_offset = _convert_to_qasymm8 ? -128 : b->info()->quantization_info().uniform().offset;

    // Get the GPU target
    const GPUTarget gpu_target = CLScheduler::get().target();

    // Set the target for the kernels
    _mm_midgard_kernel.set_target(gpu_target);
    _mm_native_kernel.set_target(gpu_target);
    _mm_reshaped_only_rhs_kernel.set_target(gpu_target);

    GEMMRHSMatrixInfo rhs_info;
    GEMMLHSMatrixInfo lhs_info;

    // Arguments used by GEMMReshapeInfo
    // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo
    // in order to know how the matrices have been reshaped
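    // Note: when the input is reinterpreted as 3D, m collapses dimensions 1 and 2 of matrix A and
    // the batch size comes from dimension 3; otherwise m is dimension 1 and the batch is dimension 2.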
    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
    const unsigned int m                       = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
    const unsigned int n                       = b->info()->dimension(0);
    const unsigned int k                       = a->info()->dimension(0);
    const unsigned int batch_size              = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();

    // Check if we need to reshape the matrix A and matrix B
    _is_gemm_reshaped = is_gemm_reshaped(_reshape_b_only_on_first_run, gpu_target);
    _is_midgard       = gpu_target == GPUTarget::MIDGARD;

    if(_convert_to_qasymm8)
    {
        // Set data type for converted weights
        TensorInfo weights_info(*b->info());
        weights_info.set_data_type(DataType::QASYMM8);
        _qasymm8_weights.allocator()->init(weights_info);
        _weights_to_qasymm8.configure(b, &_qasymm8_weights, ConvertPolicy::WRAP, 0);
    }

    const ICLTensor *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b;
    if(_is_gemm_reshaped)
    {
        matrix_b = &_tmp_b;

        if(!_reshape_b_only_on_first_run)
        {
            _memory_group.manage(&_tmp_b);
        }

        // Pick up the GEMM configuration
        std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);

        // Configure reshape RHS kernel
        _mtx_b_reshape_kernel.configure(_convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
    }

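    // Note on the reduction kernels: with quantized operands the product expands as
    //   (A - a_offset) * (B - b_offset) = A*B - a_offset * colsum(B) - b_offset * rowsum(A) + k * a_offset * b_offset
    // so the column sums of B are only needed when a_offset != 0 and the row sums of A only when b_offset != 0.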
    // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
    if(_a_offset != 0)
    {
        TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
        _vector_sum_col.allocator()->init(info_vector_sum_col);
        if(!_reshape_b_only_on_first_run)
        {
            _memory_group.manage(&_vector_sum_col);
        }

        // Configure Matrix B reduction kernel
        _mtx_b_reduction_kernel.configure(_convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col);
    }

    // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
    if(_b_offset != 0)
    {
        TensorInfo info_vector_sum_row(compute_reductionB_shape(*a->info()), 1, DataType::S32);
        _vector_sum_row.allocator()->init(info_vector_sum_row);
        _memory_group.manage(&_vector_sum_row);

        // Configure matrix A reduction kernel
        _mtx_a_reduction_kernel.configure(a, &_vector_sum_row);
    }

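    // Note: when an output stage is requested it is fused with the offset contribution: the matrix
    // multiplication writes to an intermediate S32 accumulator (_mm_result_s32) and a single kernel
    // then adds the offset terms, applies the requantization multipliers/shifts and produces the
    // final quantized output.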
    // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
    if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
    {
        _fuse_output_stage = true;

        _memory_group.manage(&_mm_result_s32);

        if(_is_gemm_reshaped)
        {
            // Configure and tune matrix multiply kernel
            _mm_reshaped_only_rhs_kernel.configure(_matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
        }
        else
        {
            if(_is_midgard)
            {
                // Configure matrix multiply kernel
                _mm_midgard_kernel.configure(_matrix_a, matrix_b, &_mm_result_s32, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
            }
            else
            {
                // Pick up the GEMM configuration
                std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);

                // Configure matrix multiply kernel
                _mm_native_kernel.configure(_matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
            }
        }
        // Configure offset contribution kernel
        const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;

        _gemm_output_stage_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
        _gemm_output_stage_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));

        GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
        gemmlowp_output_stage.output_data_type        = _matrix_a->info()->data_type();
        _offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0),
                                                            _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);

        _gemm_output_stage_multipliers.allocator()->allocate();
        _gemm_output_stage_shifts.allocator()->allocate();
        // Compute GEMM output multipliers and shifts for output stage
        _gemm_output_stage_multipliers.map();
        _gemm_output_stage_shifts.map();
        std::memcpy(_gemm_output_stage_multipliers.ptr_to_element(Coordinates(0)), gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), num_filters * sizeof(int32_t));
        std::memcpy(_gemm_output_stage_shifts.ptr_to_element(Coordinates(0)), gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t));
        _gemm_output_stage_multipliers.unmap();
        _gemm_output_stage_shifts.unmap();

        _mm_result_s32.allocator()->allocate();
    }
    else
    {
        if(_is_gemm_reshaped)
        {
            // Configure and tune matrix multiply kernel
            _mm_reshaped_only_rhs_kernel.configure(_matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
        }
        else
        {
            if(_is_midgard)
            {
                // Configure matrix multiply kernel
                _mm_midgard_kernel.configure(_matrix_a, matrix_b, output, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
            }
            else
            {
                // Pick up the GEMM configuration
                std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);

                // Configure matrix multiply kernel
                _mm_native_kernel.configure(_matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
            }
        }

        // Configure offset contribution kernel
        _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset, _b_offset);
    }

    // Allocate tensors
    if(_is_gemm_reshaped)
    {
        if(!_reshape_b_only_on_first_run)
        {
            _tmp_b.allocator()->allocate();
        }
    }

    if(_a_offset != 0 && !_reshape_b_only_on_first_run)
    {
        _vector_sum_col.allocator()->allocate();
    }

    if(_b_offset != 0)
    {
        _vector_sum_row.allocator()->allocate();
    }
}

Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
    // DataType::QSYMM8_PER_CHANNEL is supported only for the weights
    if(b->data_type() != DataType::QSYMM8_PER_CHANNEL)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
    }
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");

    int32_t a_offset = a->quantization_info().uniform().offset;
    int32_t b_offset = b->quantization_info().uniform().offset;

    const ITensorInfo *matrix_a_info = a;

    TensorInfo        tmp_b_info{};
    GEMMRHSMatrixInfo rhs_info;
    GEMMLHSMatrixInfo lhs_info;

    // Get the GPU target
    const GPUTarget gpu_target = CLScheduler::get().target();

    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
    const unsigned int n                       = b->dimension(0);
    const unsigned int k                       = a->dimension(0);
    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
    const bool         is_midgard              = gpu_target == GPUTarget::MIDGARD;

    bool reshape_matrix_b = is_gemm_reshaped(gemm_info.reshape_b_only_on_first_run(), gpu_target);

    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);

    bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type())
                              && is_data_type_quantized_asymmetric(a->data_type());
    TensorInfo weights_info(*b);
    if(convert_to_qasymm8)
    {
        b_offset = -128;
        weights_info.set_data_type(DataType::QASYMM8);
        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthConvertLayerKernel::validate(b, &weights_info, ConvertPolicy::WRAP, 0));
    }
    const ITensorInfo *matrix_b_info = &weights_info;
    if(reshape_matrix_b)
    {
        matrix_b_info = &tmp_b_info;

        // Pick up the GEMM configuration
        std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);

        // Validate reshape RHS kernel
        auto_init_if_empty(tmp_b_info, weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info)));
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info));
    }

    TensorInfo info_vector_sum_col{};
    TensorInfo info_vector_sum_row{};

    // Validate matrix B reduction kernel only if a_offset is not equal to 0
    if(a_offset != 0)
    {
        info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32);

        // Validate Matrix B reduction kernel
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col));
    }

    // Validate Matrix A reduction kernel only if b_offset is not equal to 0
    if(b_offset != 0)
    {
        info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);

        // Validate matrix A reduction kernel
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row));
    }

    if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
    {
        TensorInfo mm_result_s32_info{};

        if(reshape_matrix_b)
        {
            // Output tensor auto initialization if not yet initialized
            auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32));

            // Validate matrix multiply
            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
        }
        else
        {
            // Output tensor auto initialization if not yet initialized
            auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32));

            if(is_midgard)
            {
                // Validate matrix multiply
                ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, reshape_info));
            }
            else
            {
                // Pick up the GEMM configuration
                std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);

                // Validate matrix multiply
                ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
            }
        }

        // Validate offset contribution kernel
        const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;

        const TensorInfo gemm_output_stage_multipliers_shifts_info(TensorInfo(TensorShape(num_filters), 1, DataType::S32));

        GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
        gemmlowp_output_stage.output_data_type        = a->data_type();
        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
                                                                                            a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                            b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                            c,
                                                                                            output,
                                                                                            a_offset, b_offset,
                                                                                            gemmlowp_output_stage,
                                                                                            &gemm_output_stage_multipliers_shifts_info,
                                                                                            &gemm_output_stage_multipliers_shifts_info));
    }
    else
    {
        if(reshape_matrix_b)
        {
            // Validate matrix multiply
            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
        }
        else
        {
            if(is_midgard)
            {
                // Validate matrix multiply
                ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, reshape_info));
            }
            else
            {
                // Pick up the GEMM configuration
                std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);

                // Validate matrix multiply
                ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
            }
        }

        if(output->total_size() != 0)
        {
            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output,
                                                                                     a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                     b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                     c,
                                                                                     a_offset, b_offset));
        }
    }

    return Status{};
}

void CLGEMMLowpMatrixMultiplyCore::run()
{
    prepare();

    MemoryGroupResourceScope scope_mg(_memory_group);

    if(_is_gemm_reshaped)
    {
        if(!_reshape_b_only_on_first_run)
        {
            // Run reshape matrix B
            CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);
        }
    }

    // Run matrix B reduction kernel only if _a_offset is not equal to 0
    if(_a_offset != 0 && !_reshape_b_only_on_first_run)
    {
        CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
    }

    // Run matrix multiply
    if(_is_gemm_reshaped)
    {
        CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, false);
    }
    else
    {
        if(_is_midgard)
        {
            CLScheduler::get().enqueue(_mm_midgard_kernel, false);
        }
        else
        {
            CLScheduler::get().enqueue(_mm_native_kernel, false);
        }
    }

    // Run matrix A reduction kernel only if _b_offset is not equal to 0
    if(_b_offset != 0)
    {
        CLScheduler::get().enqueue(_mtx_a_reduction_kernel, false);
    }

    if(_fuse_output_stage)
    {
        // Run offset contribution/output stage kernel
        CLScheduler::get().enqueue(_offset_contribution_output_stage_kernel, true);
    }
    else
    {
        // Run offset contribution kernel
        CLScheduler::get().enqueue(_offset_contribution_kernel, true);
    }
}

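// prepare() performs the one-off work the first time it is called: converting the weights to QASYMM8
// if required, reshaping matrix B and computing its column sums when B is constant across runs
// (reshape_b_only_on_first_run), and marking the original weights as unused so their memory can be reclaimed.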
void CLGEMMLowpMatrixMultiplyCore::prepare()
{
    if(!_is_prepared)
    {
        if(_convert_to_qasymm8)
        {
            _qasymm8_weights.allocator()->allocate();
            CLScheduler::get().enqueue(_weights_to_qasymm8, false);
        }

        if(_is_gemm_reshaped && _reshape_b_only_on_first_run)
        {
            ARM_COMPUTE_ERROR_ON(!_original_b->is_used());

            // Run reshape kernel and mark original weights tensor as unused
            _tmp_b.allocator()->allocate();
            CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);
            _original_b->mark_as_unused();
        }

        // Run matrix B reduction kernel only if _a_offset is not equal to 0
        if(_a_offset != 0 && _reshape_b_only_on_first_run)
        {
            _vector_sum_col.allocator()->allocate();
            CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
        }

        CLScheduler::get().queue().finish();
        _is_prepared = true;
    }
}
} // namespace arm_compute