blob: 0247a39421bacfc4e64881ab29f2742f432db33c [file] [log] [blame]
Gian Marco Iodiceab182122017-10-09 15:05:40 +01001/*
Georgios Pinitase46a7be2019-02-18 15:16:14 +00002 * Copyright (c) 2017-2019 ARM Limited.
Gian Marco Iodiceab182122017-10-09 15:05:40 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
25
26#include "arm_compute/core/Error.h"
27#include "arm_compute/core/Helpers.h"
28#include "arm_compute/core/ITensor.h"
29#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
Gian Marco Iodiceab182122017-10-09 15:05:40 +010030#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
31#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
Gian Marco Iodiceab182122017-10-09 15:05:40 +010032#include "arm_compute/core/TensorInfo.h"
33#include "arm_compute/core/Types.h"
34#include "arm_compute/core/Validate.h"
Isabella Gottardie6630e42018-01-18 15:50:39 +000035#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Gian Marco Iodiceab182122017-10-09 15:05:40 +010036#include "arm_compute/runtime/NEON/NEScheduler.h"
37#include "arm_compute/runtime/TensorAllocator.h"
38#include "support/ToolchainSupport.h"
39
40using namespace arm_compute;
Isabella Gottardie6630e42018-01-18 15:50:39 +000041using namespace arm_compute::misc::shape_calculator;
Gian Marco Iodiceab182122017-10-09 15:05:40 +010042
43NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
Anthony Barbiereaefd002018-07-20 17:49:35 +010044 : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(),
George Wort2d7e6832019-02-22 16:37:41 +000045 _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _mm_result_s32(), _original_b(nullptr), _a_offset(0), _b_offset(0),
46 _run_vector_matrix_multiplication(false), _dot_product_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), _fuse_output_stage(false)
Gian Marco Iodiceab182122017-10-09 15:05:40 +010047{
48}
49
// Configure the quantized (GEMMLowp) matrix-multiply pipeline.
//
// @param a         First input matrix (quantized, QASYMM8 per validate()). Not nullptr.
// @param b         Second input matrix (weights). Not nullptr.
// @param c         Optional bias tensor; only supported when an output stage is fused (see validate()).
// @param output    Destination tensor (S32, or QASYMM8 when the output stage is fused). Not nullptr.
// @param gemm_info Metadata: reshape policy and the (possibly NONE) GEMMLowp output stage.
//
// Chooses between three execution strategies:
//   1. Assembly dot-product path (aarch64 only) via _asm_glue,
//   2. Vector-by-matrix multiplication (no reshaping needed),
//   3. Generic path: interleave A / transpose B, then NEGEMMLowpMatrixMultiplyKernel.
// Additionally wires up the row/column reduction kernels needed to apply the
// quantization zero-point offsets, either as a standalone offset-contribution
// kernel (S32 output) or fused with the requested output stage.
void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
    ARM_COMPUTE_UNUSED(c);
    // All argument checking is delegated to the static validate() so configure/validate stay in sync
    ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));

    // By default the multiply kernel consumes the inputs directly; the generic path
    // below redirects these to the reshaped temporaries _tmp_a/_tmp_b.
    const ITensor *matrix_a = a;
    const ITensor *matrix_b = b;

    // Clear state
    _mtx_a_reshape_kernel = nullptr;
    _mtx_b_reshape_kernel = nullptr;

    // Set internal variables
    // NOTE(review): quantization_info().offset is the quantization zero-point of each tensor
    _a_offset                         = a->info()->quantization_info().offset;
    _b_offset                         = b->info()->quantization_info().offset;
    _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
    _reshape_b_only_on_first_run      = gemm_info.reshape_b_only_on_first_run();
    _is_prepared                      = false;
    _original_b                       = b;

    // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
    if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
    {
        _fuse_output_stage = true;

        // Intermediate S32 accumulator, released after configuration (allocated further down)
        _memory_group.manage(&_mm_result_s32);

        TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32);

        _mm_result_s32.allocator()->init(info_mm_result_s32);
    }

#ifdef __aarch64__
    // Try the optimized assembly path first; _dot_product_path stays false if the
    // dispatcher cannot handle this configuration and we fall through to the NEON kernels.
    switch(a->info()->data_type())
    {
        case DataType::QASYMM8:
        case DataType::U8:
        case DataType::S8:
        {
            _asm_glue.configure(a, b, _fuse_output_stage ? &_mm_result_s32 : output, 1.f, 0.f, _reshape_b_only_on_first_run);
            _dot_product_path = _asm_glue.is_configured();
            break;
        }
        default:
        {
            ARM_COMPUTE_ERROR("Datatype not supported");
            break;
        }
    }
#endif /* __aarch64__ */
    // Generic path: reshape both operands unless the assembly path took over or
    // A is a vector (vector * matrix needs no reshaping)
    if(!(_dot_product_path || _run_vector_matrix_multiplication))
    {
        matrix_a = &_tmp_a;
        matrix_b = &_tmp_b;

        // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
        TensorInfo a_info(compute_interleaved_shape(*a->info()), 1, a->info()->data_type(), a->info()->quantization_info());
        // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
        TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(), b->info()->quantization_info());
        _tmp_a.allocator()->init(a_info);
        _tmp_b.allocator()->init(b_info);
        _memory_group.manage(&_tmp_a);
        // When B is reshaped only once (constant weights), _tmp_b must persist across
        // runs and is therefore kept out of the reusable memory group
        if(!_reshape_b_only_on_first_run)
        {
            _memory_group.manage(&_tmp_b);
        }

        // Configure interleave kernel
        {
            auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
            k->configure(a, &_tmp_a);
            _mtx_a_reshape_kernel = std::move(k);
        }

        // Configure transpose kernel
        {
            auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
            k->configure(b, &_tmp_b);
            _mtx_b_reshape_kernel = std::move(k);
        }
    }

    // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
    // (the column sums of B are needed to compensate for A's zero-point)
    if(_a_offset != 0)
    {
        TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);

        _vector_sum_col.allocator()->init(info_vector_sum_col);
        // Same persistence rule as _tmp_b: reused across runs when B is constant
        if(!_reshape_b_only_on_first_run)
        {
            _memory_group.manage(&_vector_sum_col);
        }

        // Configure Matrix B reduction kernel
        _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a->info()->dimension(0), false);
    }

    // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
    // (the row sums of A are needed to compensate for B's zero-point)
    if(_b_offset != 0)
    {
        TensorInfo info_vector_sum_row(compute_reductionB_shape(*a->info()), 1, DataType::S32);

        _vector_sum_row.allocator()->init(info_vector_sum_row);
        _memory_group.manage(&_vector_sum_row);

        // Configure matrix A reduction kernel
        _mtx_a_reduction_kernel.configure(a, &_vector_sum_row, a->info()->dimension(0), false);
    }

    if(_fuse_output_stage)
    {
        // Configure matrix multiply kernel (only when the assembly path is not used;
        // otherwise _asm_glue already writes into _mm_result_s32)
        if(!_dot_product_path)
        {
            auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
            k->configure(matrix_a, matrix_b, &_mm_result_s32);
            _mm_kernel = std::move(k);
        }

        // Fused kernel: applies offset contributions, bias c and the requested output stage in one pass
        _offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0),
                                                           _a_offset, _b_offset, gemm_info.gemmlowp_output_stage());

        _mm_result_s32.allocator()->allocate();
    }
    else
    {
        // Configure matrix multiply kernel
        if(!_dot_product_path)
        {
            auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
            k->configure(matrix_a, matrix_b, output);
            _mm_kernel = std::move(k);
        }
        // Configure offset contribution kernel (in-place on the S32 output)
        _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);
    }

    // Allocate tensors (deferred until after all manage() calls so the memory
    // group can compute aliasing/reuse across the whole pipeline)
    if(!_dot_product_path && !_run_vector_matrix_multiplication)
    {
        _tmp_a.allocator()->allocate();
        if(!_reshape_b_only_on_first_run)
        {
            _tmp_b.allocator()->allocate();
        }
    }

    if(_a_offset != 0 && !_reshape_b_only_on_first_run)
    {
        _vector_sum_col.allocator()->allocate();
    }

    if(_b_offset != 0)
    {
        _vector_sum_row.allocator()->allocate();
    }
}
208
// Static validation mirroring configure(): checks whether the given tensor infos
// and GEMM metadata describe a configuration this function can execute, without
// allocating or configuring anything.
//
// @param a         Info of input matrix A (must be QASYMM8).
// @param b         Info of input matrix B (same data type as A).
// @param c         Optional bias info; only allowed when an output stage is fused.
// @param output    Info of the destination (S32, or QASYMM8 with a fused output stage).
// @param gemm_info GEMM metadata (reshape flags, 3D reinterpretation, output stage).
//
// @return Status{} on success; an error Status describing the first failed check otherwise.
Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
                                    "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");

    // Mirrors configure(): the multiply kernel validates against the reshaped
    // temporaries when the generic path is taken, the raw inputs otherwise.
    const ITensorInfo *matrix_a_info = a;
    const ITensorInfo *matrix_b_info = b;

    TensorInfo tmp_a_info{};
    TensorInfo tmp_b_info{};
    TensorInfo mm_result_s32_info{};

    int32_t a_offset                        = a->quantization_info().offset;
    int32_t b_offset                        = b->quantization_info().offset;
    const bool reshape_b_only_on_first_run  = gemm_info.reshape_b_only_on_first_run();

    bool fuse_output_stage = gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
    if(fuse_output_stage)
    {
        // Intermediate S32 accumulator with the output's shape
        auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
    }

    // Check if we need to run the optimized assembly kernel
    const bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, fuse_output_stage ? &mm_result_s32_info : output, 1.f, 0.f, reshape_b_only_on_first_run));

    if(run_optimised)
    {
        ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
        // With depth_output_gemm3d != 0 the output is reinterpreted as 3D; the
        // consistency check on A's dimensions depends on whether the input is 3D too
        if(gemm_info.depth_output_gemm3d() != 0)
        {
            if(gemm_info.reinterpret_input_as_3d())
            {
                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
            }
            else
            {
                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
            }
        }
        else
        {
            ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
        }
    }
    else
    {
        // 3D reinterpretation is only supported on the assembly path
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");

        const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
        if(!run_vector_matrix_multiplication)
        {
            matrix_a_info = &tmp_a_info;
            matrix_b_info = &tmp_b_info;

            // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
            TensorShape shape_tmp_a = a->tensor_shape();
            shape_tmp_a.set(0, a->dimension(0) * 4);
            shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));

            // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
            TensorShape shape_tmp_b = b->tensor_shape();
            shape_tmp_b.set(0, b->dimension(1) * 16);
            shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));

            // Validate interleave kernel
            auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(shape_tmp_a));
            auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));

            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &tmp_a_info));
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info));
        }
    }

    TensorInfo info_vector_sum_col, info_vector_sum_row;

    // Validate matrix B reduction kernel only if _a_offset is not equal to 0
    if(a_offset != 0)
    {
        info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);

        // Configure Matrix B reduction kernel
        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, a->dimension(0), false));
    }

    // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
    if(b_offset != 0)
    {
        info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);

        // Configure matrix A reduction kernel
        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, a->dimension(0), false));
    }

    if(fuse_output_stage)
    {
        if(!run_optimised)
        {
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info));
        }

        // Validate offset contribution kernel
        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
                                                                                           a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                           b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                           c, output, a_offset, b_offset,
                                                                                           gemm_info.gemmlowp_output_stage()));
    }
    else
    {
        if(!run_optimised)
        {
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
        }
        // Validate offset contribution kernel
        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output,
                                                                                a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                a_offset, b_offset));
    }
    return Status{};
}
338
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100339void NEGEMMLowpMatrixMultiplyCore::run()
340{
Georgios Pinitas72219332018-06-05 14:56:06 +0100341 prepare();
342
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100343 _memory_group.acquire();
344
Georgios Pinitas72219332018-06-05 14:56:06 +0100345 // Reshape inputs
346 if(_mtx_a_reshape_kernel)
Pablo Tello6ff12a02017-11-02 16:09:35 +0000347 {
Georgios Pinitas72219332018-06-05 14:56:06 +0100348 NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
349 }
350 if(_mtx_b_reshape_kernel && !_reshape_b_only_on_first_run)
351 {
352 NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
Pablo Tello6ff12a02017-11-02 16:09:35 +0000353 }
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100354
Georgios Pinitas72219332018-06-05 14:56:06 +0100355 // Run GEMM
Anthony Barbiereaefd002018-07-20 17:49:35 +0100356 if(_asm_glue.is_configured())
Pablo Telloeb82fd22018-02-23 13:43:50 +0000357 {
Anthony Barbiereaefd002018-07-20 17:49:35 +0100358 _asm_glue.run();
Pablo Telloeb82fd22018-02-23 13:43:50 +0000359 }
360 else
361 {
362 NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
363 }
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100364
Gian Marcoe75a02b2017-11-08 12:24:09 +0000365 // Run matrix A reduction kernel only if _b_offset is not equal to 0
366 if(_b_offset != 0)
367 {
368 NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX);
369 }
370
371 // Run matrix B reduction kernel only if _a_offset is not equal to 0
Georgios Pinitas72219332018-06-05 14:56:06 +0100372 if(_a_offset != 0 && !_reshape_b_only_on_first_run)
Gian Marcoe75a02b2017-11-08 12:24:09 +0000373 {
374 NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
375 }
376
George Wort2d7e6832019-02-22 16:37:41 +0000377 if(_fuse_output_stage)
378 {
379 // Run offset contribution kernel
380 NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY);
381 }
382 else
383 {
384 // Run offset contribution kernel
385 NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
386 }
Gian Marcoe75a02b2017-11-08 12:24:09 +0000387
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100388 _memory_group.release();
Georgios Pinitas72219332018-06-05 14:56:06 +0100389}
Giorgio Arenabb54e4e2018-04-05 17:20:34 +0100390
Georgios Pinitas72219332018-06-05 14:56:06 +0100391void NEGEMMLowpMatrixMultiplyCore::prepare()
392{
393 if(!_is_prepared)
394 {
395 // Run assembly reshape
Anthony Barbiereaefd002018-07-20 17:49:35 +0100396 if(_asm_glue.is_configured() && _reshape_b_only_on_first_run)
Georgios Pinitas72219332018-06-05 14:56:06 +0100397 {
398 ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
399
Anthony Barbiereaefd002018-07-20 17:49:35 +0100400 _asm_glue.prepare();
Georgios Pinitas72219332018-06-05 14:56:06 +0100401 _original_b->mark_as_unused();
402 }
403 // Run non-assembly reshape
404 else if(_mtx_b_reshape_kernel && _reshape_b_only_on_first_run)
405 {
406 ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
407
408 // Run reshape kernel and mark original weights tensor as unused
409 _tmp_b.allocator()->allocate();
410 NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
411 _original_b->mark_as_unused();
412 }
413
414 // Run matrix B reduction kernel only if _a_offset is not equal to 0
415 if(_a_offset != 0 && _reshape_b_only_on_first_run)
416 {
417 _vector_sum_col.allocator()->allocate();
418 NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
419 }
420
421 _is_prepared = true;
422 }
Pablo Tello6ff12a02017-11-02 16:09:35 +0000423}