/*
 * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
#include "arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h"
#include "arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/TensorAllocator.h"
#include "support/ToolchainSupport.h"

namespace arm_compute
{
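// The assembly kernel headers included below are not warning-clean, so the relevant
// diagnostics are temporarily relaxed around them.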
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wswitch-default"
#pragma GCC diagnostic ignored "-Weffc++"
#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
#include "arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp"
#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp"
#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp"
#pragma GCC diagnostic pop
} // namespace arm_compute

#include <cmath>

namespace arm_compute
{
NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _mm_optimised_kernel(nullptr), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(),
      _run_vector_matrix_multiplication(false), _run_addition(false), _is_first_run(true), _reshape_b_only_on_first_run(false)
{
}

void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16, DataType::QS8, DataType::QS16);
    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, d);
    ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
    ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
    ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");

    if(c != nullptr)
    {
        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::F32, DataType::F16, DataType::QS8, DataType::QS16);
        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c);
        ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
        ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
        ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(0) != d->info()->dimension(0), "The C matrix must have the same number of columns as the output matrix");
        ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != d->info()->dimension(1), "The C matrix must have the same number of rows as the output matrix");
    }

    // Check if we need to reshape the matrix B only on the first run
    _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();

    // Check if the first input tensor is a vector (i.e. it has a single row).
    // If so, all the kernels for reshaping the tensors can be skipped.
    _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;

    if(_run_vector_matrix_multiplication)
    {
#if defined(__aarch64__)
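        // On AArch64, an F32 vector-matrix product with no bias contribution (no C, or beta == 0)
        // can be dispatched to the optimised transposed-GEMV assembly kernel.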
        if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
        {
            _mm_optimised_kernel = support::cpp14::make_unique<NEGEMVAArch64Kernel>();
        }

        if(_mm_optimised_kernel != nullptr)
        {
            struct CPUInfo ci = NEScheduler::get().cpu_info();

            const int N = d->info()->tensor_shape().x();
            const int K = a->info()->tensor_shape().x();

            size_t workbench_size = 0;

            if(a->info()->data_type() == DataType::F32)
            {
                workbench_size = GemvTransposed<sgemv_trans, sgemv_trans::operand_type, sgemv_trans::result_type>(&ci, N, K).get_working_size();
            }

            constexpr size_t alignment = 4096;
            ARM_COMPUTE_ERROR_ON_MSG(workbench_size == 0, "size cannot be 0");
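            // Reserve one workspace slice per thread, padded by (alignment - 1) bytes so that
            // each thread's slice can be aligned to a 4 kB boundary.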
            _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::S8));
            _memory_group.manage(&_workspace);

            // Configure matrix multiplication kernel
            _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f, false /* is_transposed_0 */, false /* is_transposed_1 */);
            _workspace.allocator()->allocate();
        }
        else
#endif /* defined(__aarch64__) */
        {
            // Configure the matrix multiply kernel
            _mm_kernel.configure(a, b, d, alpha);
        }

        // Configure matrix addition kernel
        if(beta != 0 && c != nullptr)
        {
            _ma_kernel.configure(c, d, beta);
            _run_addition = true;
        }
    }
    else
    {
#if defined(__arm__)
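        // On 32-bit ARM (ARMv7), F32 GEMM can be dispatched to the a32_sgemm_8x6 assembly kernel.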
        if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
        {
            _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch32Kernel>();
        }
#elif defined(__aarch64__)
        if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
        {
            _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
        }
        else if(a->info()->data_type() == DataType::F16 && (c == nullptr || beta == 0.f))
        {
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            _mm_optimised_kernel = support::cpp14::make_unique<NEHGEMMAArch64FP16Kernel>();
#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
            ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16.");
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
        }
#endif /* defined(__arm__) || defined(__aarch64__) */

#if defined(__arm__) || defined(__aarch64__)
        if(_mm_optimised_kernel != nullptr)
        {
            struct CPUInfo ci = NEScheduler::get().cpu_info();

            const int M = d->info()->tensor_shape().y();
            const int N = d->info()->tensor_shape().x();
            const int K = a->info()->tensor_shape().x();
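            // M, N and K follow the usual GEMM convention (A is M x K, B is K x N, D is M x N);
            // tensor shapes store (x, y) = (columns, rows).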

            size_t workbench_size = 0;

#if defined(__arm__)
            workbench_size = GemmInterleaved<sgemm_8x6, sgemm_8x6::operand_type, sgemm_8x6::result_type>(&ci, M, N, K, false, false).get_working_size();
#elif defined(__aarch64__)
            if(a->info()->data_type() == DataType::F32)
            {
                workbench_size = GemmInterleaved<sgemm_12x8, sgemm_12x8::operand_type, sgemm_12x8::result_type>(&ci, M, N, K, false, false).get_working_size();
            }
            else if(a->info()->data_type() == DataType::F16)
            {
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                workbench_size = GemmInterleaved<hgemm_24x8, hgemm_24x8::operand_type, hgemm_24x8::result_type>(&ci, M, N, K, false, false).get_working_size();
#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
                ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16.");
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
            }
#endif /* defined(__arm__) || defined(__aarch64__) */

            constexpr size_t alignment = 4096;
            ARM_COMPUTE_ERROR_ON_MSG(workbench_size == 0, "size cannot be 0");
            _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::S8));
            _memory_group.manage(&_workspace);

            // Configure matrix multiplication kernel
            _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f, false /* is_transposed_0 */, false /* is_transposed_1 */);
            _workspace.allocator()->allocate();
        }
        else
#endif /* defined(__arm__) || defined(__aarch64__) */
        {
            TensorShape shape_tmp_a = a->info()->tensor_shape();
            TensorShape shape_tmp_b = b->info()->tensor_shape();

            shape_tmp_a.set(0, a->info()->dimension(0) * 4);
            shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f));
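            // The interleave kernel packs A in blocks of 4 rows: tmp_a gets 4x the columns and
            // ceil(rows / 4) rows.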

            const unsigned int transpose_w = 16 / data_size_from_type(b->info()->data_type());
            shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w);
            shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast<float>(transpose_w)));
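            // The transpose kernel stores B in 1 x transpose_w blocks, where transpose_w is the
            // number of elements that fit in 16 bytes (one 128-bit NEON register).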

            TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position());
            TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), a->info()->fixed_point_position());

            _tmp_a.allocator()->init(info_a);
            _tmp_b.allocator()->init(info_b);

            // Manage intermediate buffers
            _memory_group.manage(&_tmp_a);
            _memory_group.manage(&_tmp_b);

            // Configure interleave kernel
            _interleave_kernel.configure(a, &_tmp_a);

            // Configure transpose kernel
            _transpose_kernel.configure(b, &_tmp_b);

            // Configure matrix multiplication kernel
            _mm_kernel.configure(&_tmp_a, &_tmp_b, d, alpha);

            // Allocate once all the configure methods have been called
            _tmp_a.allocator()->allocate();
            _tmp_b.allocator()->allocate();

            // Configure matrix addition kernel
            if(beta != 0 && c != nullptr)
            {
                _ma_kernel.configure(c, d, beta);
                _run_addition = true;
            }
        }
    }
}

void NEGEMM::run()
{
    _memory_group.acquire();

    if(_mm_optimised_kernel != nullptr)
    {
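        // The optimised assembly kernel reshapes its operands internally using the workspace,
        // so no separate interleave/transpose kernels need to run.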
        NEScheduler::get().schedule(_mm_optimised_kernel.get(), Window::DimY);
        _memory_group.release();
    }
    else
    {
        if(!_run_vector_matrix_multiplication)
        {
            // Run interleave kernel
            NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);

            // Run transpose kernel: on every run by default, or only on the first run
            // if B is marked as reshaped only once
            if(_is_first_run || !_reshape_b_only_on_first_run)
            {
                NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
            }

            _is_first_run = false;
        }

        NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);

        _memory_group.release();

        // Run matrix addition kernel
        if(_run_addition)
        {
            NEScheduler::get().schedule(&_ma_kernel, Window::DimY);
        }
    }
}
} // namespace arm_compute
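
// A minimal usage sketch (illustrative only, not part of the library source; the shapes,
// the names M/N/K and the fill step are assumptions):
//
//   #include "arm_compute/runtime/Tensor.h"
//   #include "arm_compute/runtime/NEON/functions/NEGEMM.h"
//   using namespace arm_compute;
//
//   Tensor a, b, d;
//   // TensorShape takes (columns, rows): A is M x K, B is K x N, D is M x N
//   a.allocator()->init(TensorInfo(TensorShape(K, M), 1, DataType::F32));
//   b.allocator()->init(TensorInfo(TensorShape(N, K), 1, DataType::F32));
//   d.allocator()->init(TensorInfo(TensorShape(N, M), 1, DataType::F32));
//
//   NEGEMM gemm;
//   gemm.configure(&a, &b, nullptr, &d, 1.f, 0.f); // D = 1.0 * A * B, no C matrix
//
//   a.allocator()->allocate();
//   b.allocator()->allocate();
//   d.allocator()->allocate();
//   // ... fill a and b with input data ...
//
//   gemm.run();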