/*
 * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
#include "arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h"
#include "arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/TensorAllocator.h"
#include "support/ToolchainSupport.h"

namespace arm_compute
{
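// The assembly kernel headers included below are not warning-clean, so the relevant
// diagnostics are temporarily relaxed around them.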
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wswitch-default"
#pragma GCC diagnostic ignored "-Weffc++"
#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
#include "arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp"
#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp"
#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp"
#pragma GCC diagnostic pop
} // namespace arm_compute

#include <cmath>

namespace arm_compute
{
NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _mm_optimised_kernel(nullptr), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(),
      _run_vector_matrix_multiplication(false), _run_addition(false), _is_first_run(true), _reshape_b_only_on_first_run(false)
{
}

void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16, DataType::QS8, DataType::QS16);
    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, d);
    ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
    ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
    ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");

    if(c != nullptr)
    {
        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::F32, DataType::F16, DataType::QS8, DataType::QS16);
        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c);
        ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
        ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
        ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(0) != d->info()->dimension(0), "The C matrix must have the same number of columns as the output matrix");
        ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != d->info()->dimension(1), "The C matrix must have the same number of rows as the output matrix");
    }

    // Check if we need to reshape the matrix B only on the first run
    _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();

    // Check if the first input tensor is a vector (i.e. it has a single row).
    // If so, all the kernels for reshaping the tensors can be skipped.
    _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;

    if(_run_vector_matrix_multiplication)
    {
#if defined(__aarch64__)
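        // On AArch64, an F32 vector-matrix product with no bias contribution (no C, or beta == 0)
        // can be dispatched to the optimised transposed-GEMV assembly kernel.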
        if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
        {
            _mm_optimised_kernel = support::cpp14::make_unique<NEGEMVAArch64Kernel>();
        }

        if(_mm_optimised_kernel != nullptr)
        {
            struct CPUInfo ci = NEScheduler::get().cpu_info();

            const int N = d->info()->tensor_shape().x();
            const int K = a->info()->tensor_shape().x();

            size_t workbench_size = 0;

            if(a->info()->data_type() == DataType::F32)
            {
                workbench_size = GemvTransposed<sgemv_trans, sgemv_trans::operand_type, sgemv_trans::result_type>(&ci, N, K).get_working_size();
            }

            constexpr size_t alignment = 4096;
            ARM_COMPUTE_ERROR_ON_MSG(workbench_size == 0, "size cannot be 0");
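            // Reserve one workspace slice per thread, padded by (alignment - 1) bytes so that
            // each thread's slice can be aligned to a 4 kB boundary.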
            _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::S8));
            _memory_group.manage(&_workspace);

            // Configure matrix multiplication kernel
            _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f, false /* is_transposed_0 */, false /* is_transposed_1 */);
            _workspace.allocator()->allocate();
        }
        else
#endif /* defined(__aarch64__) */
        {
            // Configure the matrix multiply kernel
            _mm_kernel.configure(a, b, d, alpha);
        }

        // Configure matrix addition kernel
        if(beta != 0 && c != nullptr)
        {
            _ma_kernel.configure(c, d, beta);
            _run_addition = true;
        }
    }
    else
    {
#if defined(__arm__)
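        // On 32-bit ARM (ARMv7), F32 GEMM can be dispatched to the a32_sgemm_8x6 assembly kernel.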
        if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
        {
            _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch32Kernel>();
        }
#elif defined(__aarch64__)
        if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
        {
            _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
        }
        else if(a->info()->data_type() == DataType::F16 && (c == nullptr || beta == 0.f))
        {
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            _mm_optimised_kernel = support::cpp14::make_unique<NEHGEMMAArch64FP16Kernel>();
#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
            ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16.");
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
        }
#endif /* defined(__arm__) || defined(__aarch64__) */

#if defined(__arm__) || defined(__aarch64__)
        if(_mm_optimised_kernel != nullptr)
        {
            struct CPUInfo ci = NEScheduler::get().cpu_info();

            const int M = d->info()->tensor_shape().y();
            const int N = d->info()->tensor_shape().x();
            const int K = a->info()->tensor_shape().x();
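            // M, N and K follow the usual GEMM convention (A is M x K, B is K x N, D is M x N);
            // tensor shapes store (x, y) = (columns, rows).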

            size_t workbench_size = 0;

#if defined(__arm__)
            workbench_size = GemmInterleaved<sgemm_8x6, sgemm_8x6::operand_type, sgemm_8x6::result_type>(&ci, M, N, K, false, false).get_working_size();
#elif defined(__aarch64__)
            if(a->info()->data_type() == DataType::F32)
            {
                workbench_size = GemmInterleaved<sgemm_12x8, sgemm_12x8::operand_type, sgemm_12x8::result_type>(&ci, M, N, K, false, false).get_working_size();
            }
            else if(a->info()->data_type() == DataType::F16)
            {
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                workbench_size = GemmInterleaved<hgemm_24x8, hgemm_24x8::operand_type, hgemm_24x8::result_type>(&ci, M, N, K, false, false).get_working_size();
#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
                ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16.");
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
            }
#endif /* defined(__arm__) || defined(__aarch64__) */

            constexpr size_t alignment = 4096;
            ARM_COMPUTE_ERROR_ON_MSG(workbench_size == 0, "size cannot be 0");
            _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::S8));
            _memory_group.manage(&_workspace);

            // Configure matrix multiplication kernel
            _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f, false /* is_transposed_0 */, false /* is_transposed_1 */);
            _workspace.allocator()->allocate();
        }
        else
#endif /* defined(__arm__) || defined(__aarch64__) */
        {
            TensorShape shape_tmp_a = a->info()->tensor_shape();
            TensorShape shape_tmp_b = b->info()->tensor_shape();

            shape_tmp_a.set(0, a->info()->dimension(0) * 4);
            shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f));
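            // The interleave kernel packs A in blocks of 4 rows: tmp_a gets 4x the columns and
            // ceil(rows / 4) rows.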

            const unsigned int transpose_w = 16 / data_size_from_type(b->info()->data_type());
            shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w);
            shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast<float>(transpose_w)));
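            // The transpose kernel stores B in 1 x transpose_w blocks, where transpose_w is the
            // number of elements that fit in 16 bytes (one 128-bit NEON register).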

            TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position());
            TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), a->info()->fixed_point_position());

            _tmp_a.allocator()->init(info_a);
            _tmp_b.allocator()->init(info_b);

            // Manage intermediate buffers
            _memory_group.manage(&_tmp_a);
            _memory_group.manage(&_tmp_b);

            // Configure interleave kernel
            _interleave_kernel.configure(a, &_tmp_a);

            // Configure transpose kernel
            _transpose_kernel.configure(b, &_tmp_b);

            // Configure matrix multiplication kernel
            _mm_kernel.configure(&_tmp_a, &_tmp_b, d, alpha);

            // Allocate once all the configure methods have been called
            _tmp_a.allocator()->allocate();
            _tmp_b.allocator()->allocate();

            // Configure matrix addition kernel
            if(beta != 0 && c != nullptr)
            {
                _ma_kernel.configure(c, d, beta);
                _run_addition = true;
            }
        }
    }
}

void NEGEMM::run()
{
    _memory_group.acquire();

    if(_mm_optimised_kernel != nullptr)
    {
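        // The optimised assembly kernel reshapes its operands internally using the workspace,
        // so no separate interleave/transpose kernels need to run.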
        NEScheduler::get().schedule(_mm_optimised_kernel.get(), Window::DimY);
        _memory_group.release();
    }
    else
    {
        if(!_run_vector_matrix_multiplication)
        {
            // Run interleave kernel
            NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);

            // Run transpose kernel: on every run by default, or only on the first run
            // if B is marked as reshaped only once
            if(_is_first_run || !_reshape_b_only_on_first_run)
            {
                NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
            }

            _is_first_run = false;
        }

        NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);

        _memory_group.release();

        // Run matrix addition kernel
        if(_run_addition)
        {
            NEScheduler::get().schedule(&_ma_kernel, Window::DimY);
        }
    }
}
} // namespace arm_compute
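
// A minimal usage sketch (illustrative only, not part of the library source; the shapes,
// the names M/N/K and the fill step are assumptions):
//
//   #include "arm_compute/runtime/Tensor.h"
//   #include "arm_compute/runtime/NEON/functions/NEGEMM.h"
//   using namespace arm_compute;
//
//   Tensor a, b, d;
//   // TensorShape takes (columns, rows): A is M x K, B is K x N, D is M x N
//   a.allocator()->init(TensorInfo(TensorShape(K, M), 1, DataType::F32));
//   b.allocator()->init(TensorInfo(TensorShape(N, K), 1, DataType::F32));
//   d.allocator()->init(TensorInfo(TensorShape(N, M), 1, DataType::F32));
//
//   NEGEMM gemm;
//   gemm.configure(&a, &b, nullptr, &d, 1.f, 0.f); // D = 1.0 * A * B, no C matrix
//
//   a.allocator()->allocate();
//   b.allocator()->allocate();
//   d.allocator()->allocate();
//   // ... fill a and b with input data ...
//
//   gemm.run();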