blob: da5ac22fdce55464c1d26cbdd51689f7277efed1 [file] [log] [blame]
Gian Marco Iodiceab182122017-10-09 15:05:40 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
25
26#include "arm_compute/core/Error.h"
27#include "arm_compute/core/Helpers.h"
28#include "arm_compute/core/ITensor.h"
Pablo Tello6ff12a02017-11-02 16:09:35 +000029#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
Gian Marco Iodiceab182122017-10-09 15:05:40 +010030#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
Gian Marco Iodiceab182122017-10-09 15:05:40 +010031#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
32#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
33#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h"
34#include "arm_compute/core/TensorInfo.h"
35#include "arm_compute/core/Types.h"
36#include "arm_compute/core/Validate.h"
37#include "arm_compute/runtime/NEON/NEScheduler.h"
38#include "arm_compute/runtime/TensorAllocator.h"
39#include "support/ToolchainSupport.h"
40
Pablo Tello6ff12a02017-11-02 16:09:35 +000041namespace arm_compute
42{
43#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
Pablo Tello6681d242017-11-13 16:44:08 +000044#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp"
Pablo Tello6ff12a02017-11-02 16:09:35 +000045} // namespace arm_compute
46
Gian Marco Iodiceab182122017-10-09 15:05:40 +010047using namespace arm_compute;
48
49NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
Gian Marcoe75a02b2017-11-08 12:24:09 +000050 : _memory_group(std::move(memory_manager)), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(),
51 _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _a_offset(0), _b_offset(0)
Gian Marco Iodiceab182122017-10-09 15:05:40 +010052{
53}
54
55void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, ITensor *output)
56{
Georgios Pinitasa3b1b462017-11-16 19:24:39 +000057 ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
58 ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info()));
Gian Marco Iodiceab182122017-10-09 15:05:40 +010059
Gian Marcoe75a02b2017-11-08 12:24:09 +000060 bool dot_product_path = false;
61
62 _a_offset = a->info()->quantization_info().offset;
63 _b_offset = b->info()->quantization_info().offset;
64
Gian Marco Iodiceab182122017-10-09 15:05:40 +010065#ifdef ARM_COMPUTE_AARCH64_V8_2
66 // Check for DOT product instruction
67 const struct CPUInfo ci = NEScheduler::get().cpu_info();
68 const int cpu_has_dotprod = static_cast<int>(ci.CPU) & static_cast<int>(CPUTarget::DOT);
69
70 if(cpu_has_dotprod != 0)
71 {
Gian Marcoe75a02b2017-11-08 12:24:09 +000072 dot_product_path = true;
73
Gian Marco Iodiceab182122017-10-09 15:05:40 +010074 // Configure matrix multiply kernel
Pablo Tello6ff12a02017-11-02 16:09:35 +000075 struct CPUInfo ci = NEScheduler::get().cpu_info();
76 const int M = output->info()->tensor_shape().y();
77 const int N = output->info()->tensor_shape().x();
78 const int K = a->info()->tensor_shape().x();
79
Pablo Tello6681d242017-11-13 16:44:08 +000080 const size_t workbench_size = GemmInterleaved<gemm_u8_12x8, gemm_u8_12x8::operand_type, gemm_u8_12x8::result_type>(&ci, M, N, K, false, false).get_working_size();
81 constexpr size_t alignment = 4096;
82 _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
Pablo Tello6ff12a02017-11-02 16:09:35 +000083 _memory_group.manage(&_workspace);
Gian Marcoe75a02b2017-11-08 12:24:09 +000084
Pablo Tello6ff12a02017-11-02 16:09:35 +000085 // Configure matrix multiplication kernel
86 auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64V8P4Kernel>();
87 k->configure(a, b, output, &_workspace, 1.f, 1.f);
88 _mm_kernel = std::move(k);
Gian Marco Iodiceab182122017-10-09 15:05:40 +010089 }
90 else
91#endif /* ARM_COMPUTE_AARCH64_V8_2 */
92 {
93 // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
94 TensorShape shape_tmp_a = a->info()->tensor_shape();
95 shape_tmp_a.set(0, a->info()->dimension(0) * 4);
96 shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));
97
98 // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
99 TensorShape shape_tmp_b = b->info()->tensor_shape();
100 shape_tmp_b.set(0, b->info()->dimension(1) * 16);
101 shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));
102
103 TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
104 TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
105 _tmp_a.allocator()->init(info_a);
106 _tmp_b.allocator()->init(info_b);
107 _memory_group.manage(&_tmp_a);
108 _memory_group.manage(&_tmp_b);
109
110 // Configure interleave kernel
111 {
112 auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
113 k->configure(a, &_tmp_a);
114 _mtx_a_reshape_kernel = std::move(k);
115 }
116
117 // Configure transpose kernel
118 {
119 auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
120 k->configure(b, &_tmp_b);
121 _mtx_b_reshape_kernel = std::move(k);
122 }
123
124 // Configure matrix multiply kernel
125 {
126 auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
127 k->configure(&_tmp_a, &_tmp_b, output);
128 _mm_kernel = std::move(k);
129 }
Gian Marcoe75a02b2017-11-08 12:24:09 +0000130 }
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100131
Gian Marcoe75a02b2017-11-08 12:24:09 +0000132 // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
133 if(_a_offset != 0)
134 {
135 TensorShape shape_vector_sum_col = b->info()->tensor_shape();
Gian Marco05288a22017-11-21 10:57:50 +0000136 if(b->info()->num_dimensions() > 1)
137 {
138 shape_vector_sum_col.remove_dimension(1);
139 }
Gian Marcoe75a02b2017-11-08 12:24:09 +0000140 TensorInfo info_vector_sum_col(shape_vector_sum_col, 1, DataType::S32);
141 _vector_sum_col.allocator()->init(info_vector_sum_col);
142 _memory_group.manage(&_vector_sum_col);
143
144 // Configure Matrix B reduction kernel
145 _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a->info()->dimension(0), false);
146 }
147
148 // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
149 if(_b_offset != 0)
150 {
151 TensorShape shape_vector_sum_row = a->info()->tensor_shape();
152 shape_vector_sum_row.set(Window::DimX, a->info()->dimension(1));
Gian Marco05288a22017-11-21 10:57:50 +0000153 if(a->info()->num_dimensions() > 1)
154 {
155 shape_vector_sum_row.remove_dimension(1);
156 }
Gian Marcoe75a02b2017-11-08 12:24:09 +0000157 TensorInfo info_vector_sum_row(shape_vector_sum_row, 1, DataType::S32);
158 _vector_sum_row.allocator()->init(info_vector_sum_row);
159 _memory_group.manage(&_vector_sum_row);
160
161 // Configure matrix A reduction kernel
162 _mtx_a_reduction_kernel.configure(a, &_vector_sum_row, a->info()->dimension(0), false);
163 }
164
165 // Configure offset contribution kernel
166 _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);
167
168 // Allocate tensors
169 if(!dot_product_path)
170 {
Pablo Tello6ff12a02017-11-02 16:09:35 +0000171 _tmp_a.allocator()->allocate();
172 _tmp_b.allocator()->allocate();
173 }
Gian Marcoe75a02b2017-11-08 12:24:09 +0000174 else
175 {
176 _workspace.allocator()->allocate();
177 }
178
179 if(_a_offset != 0)
180 {
181 _vector_sum_col.allocator()->allocate();
182 }
183
184 if(_b_offset != 0)
185 {
186 _vector_sum_row.allocator()->allocate();
187 }
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100188}
189
Georgios Pinitasa3b1b462017-11-16 19:24:39 +0000190Error NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output)
191{
192 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
193 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
194 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
195 ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
196 "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
197 ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(1) != (output)->dimension(1),
198 "The output matrix must have the same number of rows as the matrix A");
199 ARM_COMPUTE_RETURN_ERROR_ON_MSG((b)->dimension(0) != (output)->dimension(0),
200 "The output matrix must have the same number of columns as the matrix B");
201
202 int32_t a_offset = a->quantization_info().offset;
203 int32_t b_offset = b->quantization_info().offset;
204
205#ifdef ARM_COMPUTE_AARCH64_V8_2
206 // Check for DOT product instruction
207 const struct CPUInfo ci = NEScheduler::get().cpu_info();
208 const int cpu_has_dotprod = static_cast<int>(ci.CPU) & static_cast<int>(CPUTarget::DOT);
209
210 if(cpu_has_dotprod != 0)
211 {
212 // Validate matrix multiply kernel
213 ARM_COMPUTE_RETURN_ERROR_ON(NEGEMMLowpAArch64V8P4Kernel::validate(a, b, output));
214 }
215 else
216#endif /* ARM_COMPUTE_AARCH64_V8_2 */
217 {
218 // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
219 TensorShape shape_tmp_a = a->tensor_shape();
220 shape_tmp_a.set(0, a->dimension(0) * 4);
221 shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
222
223 // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
224 TensorShape shape_tmp_b = b->tensor_shape();
225 shape_tmp_b.set(0, b->dimension(1) * 16);
226 shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
227
228 TensorInfo info_a(shape_tmp_a, 1, a->data_type());
229 TensorInfo info_b(shape_tmp_b, 1, b->data_type());
230
231 ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
232 ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
233 ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
234 }
235
236 TensorInfo info_vector_sum_col, info_vector_sum_row;
237
238 // Validate matrix B reduction kernel only if _a_offset is not equal to 0
239 if(a_offset != 0)
240 {
241 TensorShape shape_vector_sum_col = b->tensor_shape();
242 shape_vector_sum_col.remove_dimension(1);
243 info_vector_sum_col = TensorInfo(shape_vector_sum_col, 1, DataType::S32);
244
245 // Configure Matrix B reduction kernel
246 ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, a->dimension(0), false));
247 }
248
249 // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
250 if(b_offset != 0)
251 {
252 TensorShape shape_vector_sum_row = a->tensor_shape();
253 shape_vector_sum_row.set(Window::DimX, a->dimension(1));
254 shape_vector_sum_row.remove_dimension(1);
255 info_vector_sum_row = TensorInfo(shape_vector_sum_row, 1, DataType::S32);
256
257 // Configure matrix A reduction kernel
258 ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, a->dimension(0), false));
259 }
260
261 // Validate offset contribution kernel
262 ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output,
263 a_offset == 0 ? nullptr : &info_vector_sum_col,
264 b_offset == 0 ? nullptr : &info_vector_sum_row,
265 a_offset, b_offset));
266
267 return Error{};
268}
269
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100270void NEGEMMLowpMatrixMultiplyCore::run()
271{
272 _memory_group.acquire();
273
Pablo Tello6ff12a02017-11-02 16:09:35 +0000274 if(_mtx_a_reshape_kernel)
275 {
276 NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
277 }
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100278
Pablo Tello6ff12a02017-11-02 16:09:35 +0000279 if(_mtx_b_reshape_kernel)
280 {
281 NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
282 }
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100283
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100284 NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
285
Gian Marcoe75a02b2017-11-08 12:24:09 +0000286 // Run matrix A reduction kernel only if _b_offset is not equal to 0
287 if(_b_offset != 0)
288 {
289 NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX);
290 }
291
292 // Run matrix B reduction kernel only if _a_offset is not equal to 0
293 if(_a_offset != 0)
294 {
295 NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
296 }
297
298 // Run offset contribution kernel
299 NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
300
Gian Marco Iodiceab182122017-10-09 15:05:40 +0100301 _memory_group.release();
Pablo Tello6ff12a02017-11-02 16:09:35 +0000302}