Blame - src/runtime/NEON/functions/NEGEMM.cpp - ml/ComputeLibrary

blob: 1d6aa65e37c2a4448e83ecf97ceea9266b40a5f4 [file] [log] [blame]

Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2017 ARM Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
				25
				26	#include "arm_compute/core/Error.h"
				27	#include "arm_compute/core/Helpers.h"
				28	#include "arm_compute/core/ITensor.h"
Moritz Pflanzer	beabe3b	2017-08-31 14:56:32 +0100	[diff] [blame]	29	#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	30	#include "arm_compute/core/TensorInfo.h"
				31	#include "arm_compute/core/Types.h"
				32	#include "arm_compute/core/Validate.h"
				33	#include "arm_compute/runtime/NEON/NEScheduler.h"
				34	#include "arm_compute/runtime/TensorAllocator.h"
Moritz Pflanzer	beabe3b	2017-08-31 14:56:32 +0100	[diff] [blame]	35	#include "support/ToolchainSupport.h"
				36
				37	namespace arm_compute
				38	{
				39	#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
				40	#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
				41	} // namespace arm_compute
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	42
				43	#include <cmath>
				44
Moritz Pflanzer	beabe3b	2017-08-31 14:56:32 +0100	[diff] [blame]	45	namespace arm_compute
				46	{
Georgios Pinitas	658039b	2017-09-15 16:30:50 +0100	[diff] [blame]	47	NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager)
Moritz Pflanzer	beabe3b	2017-08-31 14:56:32 +0100	[diff] [blame]	48	: _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _mm_optimised_kernel(nullptr), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(),
				49	_run_vector_matrix_multiplication(false), _run_addition(false)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	50	{
				51	}
				52
				53	void NEGEMM::configure(const ITensor a, const ITensor b, const ITensor c, ITensor d, float alpha, float beta)
				54	{
Gian Marco Iodice	bdb6b0b	2017-06-30 12:21:00 +0100	[diff] [blame]	55	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16, DataType::QS8, DataType::QS16);
Gian Marco Iodice	3a3066b	2017-06-23 13:38:14 +0100	[diff] [blame]	56	ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, d);
Gian Marco Iodice	bdb6b0b	2017-06-30 12:21:00 +0100	[diff] [blame]	57	ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	58
				59	if(c != nullptr)
				60	{
Gian Marco Iodice	bdb6b0b	2017-06-30 12:21:00 +0100	[diff] [blame]	61	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(c, 1, DataType::F32, DataType::F16, DataType::QS8, DataType::QS16);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	62	ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c);
				63	ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
				64	ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
				65	ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(0) != d->info()->dimension(0), "The C matrix must have the same number of rows as the output matrix");
				66	ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != d->info()->dimension(1), "The C matrix must have the same number of columns as the output matrix");
				67	}
				68
Moritz Pflanzer	beabe3b	2017-08-31 14:56:32 +0100	[diff] [blame]	69	_run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	70
Moritz Pflanzer	beabe3b	2017-08-31 14:56:32 +0100	[diff] [blame]	71	#if defined(__aarch64__)
				72	if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr \|\| beta == 0.f))
				73	{
				74	_mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
				75	}
				76	#endif /* defined(__aarch64__) */
				77
				78	// Check if the first input tensor is a vector.
				79	// If so, all the kernels for reshaping the tensors can be skipped
				80	if(_run_vector_matrix_multiplication)
				81	{
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	82	// Configure the matrix multiply kernel
				83	_mm_kernel.configure(a, b, d, alpha);
Moritz Pflanzer	beabe3b	2017-08-31 14:56:32 +0100	[diff] [blame]	84
				85	// Configure matrix addition kernel
				86	if(beta != 0 && c != nullptr)
				87	{
				88	_ma_kernel.configure(c, d, beta);
				89	_run_addition = true;
				90	}
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	91	}
				92	else
				93	{
Moritz Pflanzer	beabe3b	2017-08-31 14:56:32 +0100	[diff] [blame]	94	#if defined(__aarch64__)
				95	if(_mm_optimised_kernel != nullptr)
				96	{
				97	struct CPUInfo ci = NEScheduler::get().cpu_info();
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	98
Moritz Pflanzer	beabe3b	2017-08-31 14:56:32 +0100	[diff] [blame]	99	const int M = d->info()->tensor_shape().y();
				100	const int N = d->info()->tensor_shape().x();
				101	const int K = a->info()->tensor_shape().x();
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	102
Moritz Pflanzer	beabe3b	2017-08-31 14:56:32 +0100	[diff] [blame]	103	GemmInterleaved<sgemm_12x8, float, float> gemm(&ci, M, N, K, false, false);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	104
Moritz Pflanzer	beabe3b	2017-08-31 14:56:32 +0100	[diff] [blame]	105	constexpr size_t alignment = 4096;
				106	_workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
				107	_memory_group.manage(&_workspace);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	108
Moritz Pflanzer	beabe3b	2017-08-31 14:56:32 +0100	[diff] [blame]	109	// Configure matrix multiplication kernel
				110	_mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	111
Moritz Pflanzer	beabe3b	2017-08-31 14:56:32 +0100	[diff] [blame]	112	_workspace.allocator()->allocate();
				113	}
				114	else
				115	#endif /* defined(__aarch64__) */
				116	{
				117	TensorShape shape_tmp_a = a->info()->tensor_shape();
				118	TensorShape shape_tmp_b = b->info()->tensor_shape();
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	119
Moritz Pflanzer	beabe3b	2017-08-31 14:56:32 +0100	[diff] [blame]	120	shape_tmp_a.set(0, a->info()->dimension(0) * 4);
				121	shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f));
Georgios Pinitas	658039b	2017-09-15 16:30:50 +0100	[diff] [blame]	122
Moritz Pflanzer	beabe3b	2017-08-31 14:56:32 +0100	[diff] [blame]	123	const unsigned int transpose_w = 16 / data_size_from_type(b->info()->data_type());
				124	shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w);
				125	shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast<float>(transpose_w)));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	126
Moritz Pflanzer	beabe3b	2017-08-31 14:56:32 +0100	[diff] [blame]	127	TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position());
				128	TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), a->info()->fixed_point_position());
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	129
Moritz Pflanzer	beabe3b	2017-08-31 14:56:32 +0100	[diff] [blame]	130	_tmp_a.allocator()->init(info_a);
				131	_tmp_b.allocator()->init(info_b);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	132
Moritz Pflanzer	beabe3b	2017-08-31 14:56:32 +0100	[diff] [blame]	133	// Manage intermediate buffers
				134	_memory_group.manage(&_tmp_a);
				135	_memory_group.manage(&_tmp_b);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	136
Moritz Pflanzer	beabe3b	2017-08-31 14:56:32 +0100	[diff] [blame]	137	// Configure interleave kernel
				138	_interleave_kernel.configure(a, &_tmp_a);
				139
				140	// Configure transpose kernel
				141	_transpose_kernel.configure(b, &_tmp_b);
				142
				143	// Configure matrix multiplication kernel
				144	_mm_kernel.configure(&_tmp_a, &_tmp_b, d, alpha);
				145
				146	// Allocate once the all configure methods have been called
				147	_tmp_a.allocator()->allocate();
				148	_tmp_b.allocator()->allocate();
				149
				150	// Configure matrix addition kernel
				151	if(beta != 0 && c != nullptr)
				152	{
				153	_ma_kernel.configure(c, d, beta);
				154	_run_addition = true;
				155	}
				156	}
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	157	}
				158	}
				159
				160	void NEGEMM::run()
				161	{
Georgios Pinitas	658039b	2017-09-15 16:30:50 +0100	[diff] [blame]	162	_memory_group.acquire();
				163
Moritz Pflanzer	beabe3b	2017-08-31 14:56:32 +0100	[diff] [blame]	164	if(_mm_optimised_kernel != nullptr)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	165	{
Moritz Pflanzer	beabe3b	2017-08-31 14:56:32 +0100	[diff] [blame]	166	NEScheduler::get().schedule(_mm_optimised_kernel.get(), Window::DimY);
				167	_memory_group.release();
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	168	}
Moritz Pflanzer	beabe3b	2017-08-31 14:56:32 +0100	[diff] [blame]	169	else
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	170	{
Moritz Pflanzer	beabe3b	2017-08-31 14:56:32 +0100	[diff] [blame]	171	if(!_run_vector_matrix_multiplication)
				172	{
				173	// Run interleave kernel
				174	NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
				175
				176	// Run transpose kernel
				177	NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
				178	}
				179
				180	NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
				181
				182	_memory_group.release();
				183
				184	// Run matrix addition kernel
				185	if(_run_addition)
				186	{
				187	NEScheduler::get().schedule(&_ma_kernel, Window::DimY);
				188	}
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	189	}
				190	}
Moritz Pflanzer	beabe3b	2017-08-31 14:56:32 +0100	[diff] [blame]	191	} // namespace arm_compute