Blame - src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp - ml/ComputeLibrary

blob: c929983162c71776fbb92e5f3474d68f10020e19 [file] [log] [blame]

Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1	/*
Isabella Gottardi	0a1090a	2019-02-14 18:07:36 +0000	[diff] [blame]	2	* Copyright (c) 2016-2019 ARM Limited.
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
				25
				26	#include "arm_compute/core/Error.h"
				27	#include "arm_compute/core/Helpers.h"
				28	#include "arm_compute/core/ITensor.h"
				29	#include "arm_compute/core/NEON/INEKernel.h"
				30	#include "arm_compute/core/Types.h"
				31	#include "arm_compute/core/Validate.h"
				32	#include "arm_compute/core/Window.h"
Isabella Gottardi	e6630e4	2018-01-18 15:50:39 +0000	[diff] [blame]	33	#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	34
				35	#include <arm_neon.h>
				36	#include <cstddef>
				37	#include <cstdint>
				38	#include <tuple>
				39
				40	using namespace arm_compute;
Isabella Gottardi	e6630e4	2018-01-18 15:50:39 +0000	[diff] [blame]	41	using namespace arm_compute::misc::shape_calculator;
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	42
				43	namespace
				44	{
Georgios Pinitas	631c41a	2017-12-06 11:53:03 +0000	[diff] [blame]	45	Status validate_arguments(const ITensorInfo input, const ITensorInfo output)
Georgios Pinitas	a3b1b46	2017-11-16 19:24:39 +0000	[diff] [blame]	46	{
Anthony Barbier	eaefd00	2018-07-20 17:49:35 +0100	[diff] [blame]	47	//Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
Vidhya Sudhan Loganathan	7485d5a	2018-07-04 09:34:00 +0100	[diff] [blame]	48	ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8,
				49	DataType::U16, DataType::S16, DataType::U32, DataType::S32,
Georgios Pinitas	a3b1b46	2017-11-16 19:24:39 +0000	[diff] [blame]	50	DataType::F16, DataType::F32);
				51	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
Georgios Pinitas	a3b1b46	2017-11-16 19:24:39 +0000	[diff] [blame]	52
				53	if(output->total_size() != 0)
				54	{
				55	TensorShape output_shape = input->tensor_shape();
				56	output_shape.set(0, input->dimension(0) * 4);
				57	output_shape.set(1, std::ceil(input->dimension(1) / 4.0f));
				58	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
				59	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
Isabella Gottardi	0a1090a	2019-02-14 18:07:36 +0000	[diff] [blame]	60	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
Georgios Pinitas	a3b1b46	2017-11-16 19:24:39 +0000	[diff] [blame]	61	}
				62
Georgios Pinitas	631c41a	2017-12-06 11:53:03 +0000	[diff] [blame]	63	return Status{};
Georgios Pinitas	a3b1b46	2017-11-16 19:24:39 +0000	[diff] [blame]	64	}
				65
Georgios Pinitas	631c41a	2017-12-06 11:53:03 +0000	[diff] [blame]	66	std::pair<Status, Window> validate_and_configure_window(ITensorInfo input, ITensorInfo output)
Georgios Pinitas	a3b1b46	2017-11-16 19:24:39 +0000	[diff] [blame]	67	{
				68	unsigned int num_elems_processed_per_iteration_x = (input->element_size() == 1) ? 8 : 4;
				69	constexpr unsigned int num_elems_processed_per_iteration_y = 4;
				70	bool window_changed = false;
				71
				72	// Configure kernel window
				73	Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
				74	AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
				75	window_changed = window_changed \|\| update_window_and_padding(win, input_access);
				76
				77	// Configure window in case of configured output
				78	if(output->total_size() != 0)
				79	{
				80	AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y, 1, 4.0f, 0.25f);
				81	window_changed = window_changed \|\| update_window_and_padding(win, output_access);
				82	output_access.set_valid_region(win, input->valid_region());
				83	}
				84
Georgios Pinitas	631c41a	2017-12-06 11:53:03 +0000	[diff] [blame]	85	Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
Georgios Pinitas	a3b1b46	2017-11-16 19:24:39 +0000	[diff] [blame]	86	return std::make_pair(err, win);
				87	}
				88
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	89	void gemm_interleave_8bit_elements(const ITensor input, ITensor output, const Window &window)
				90	{
				91	const size_t in_stride = input->info()->strides_in_bytes()[1];
				92
				93	// Set window for output tensor
				94	Window win_out(window);
				95	win_out.scale(Window::DimY, 0.25f);
				96	Iterator in(input, window);
				97
				98	win_out.set_dimension_step(Window::DimX, 32);
				99	Iterator out(output, win_out);
				100
				101	execute_window_loop(window, [&](const Coordinates &)
				102	{
				103	const uint8x8x4_t data =
				104	{
				105	{
				106	vld1_u8(in.ptr() + 0 * in_stride),
				107	vld1_u8(in.ptr() + 1 * in_stride),
				108	vld1_u8(in.ptr() + 2 * in_stride),
				109	vld1_u8(in.ptr() + 3 * in_stride),
				110	}
				111	};
				112	vst4_u8(out.ptr(), data);
				113	},
				114	in, out);
				115	}
				116
				117	void gemm_interleave_16bit_elements(const ITensor input, ITensor output, const Window &window)
				118	{
				119	const size_t in_stride = input->info()->strides_in_bytes()[1];
				120
				121	// Set window for output tensor
				122	Window win_out(window);
				123	win_out.scale(Window::DimY, 0.25f);
				124	Iterator in(input, window);
				125
				126	win_out.set_dimension_step(Window::DimX, 16);
				127	Iterator out(output, win_out);
				128
Michalis Spyrou	bcfd09a	2019-05-01 13:03:59 +0100	[diff] [blame^]	129	execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	130	{
				131	const uint16x4x4_t data =
				132	{
				133	{
Moritz Pflanzer	beabe3b	2017-08-31 14:56:32 +0100	[diff] [blame]	134	vld1_u16(reinterpret_cast<const uint16_t >(in.ptr() + 0 in_stride)),
				135	vld1_u16(reinterpret_cast<const uint16_t >(in.ptr() + 1 in_stride)),
				136	vld1_u16(reinterpret_cast<const uint16_t >(in.ptr() + 2 in_stride)),
				137	vld1_u16(reinterpret_cast<const uint16_t >(in.ptr() + 3 in_stride)),
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	138	}
				139	};
				140	vst4_u16(reinterpret_cast<uint16_t *>(out.ptr()), data);
				141	},
				142	in, out);
				143	}
				144
				145	void gemm_interleave_32bit_elements(const ITensor input, ITensor output, const Window &window)
				146	{
				147	const size_t in_stride = input->info()->strides_in_bytes()[1];
				148
				149	// Set window for output tensor
				150	Window win_out(window);
				151	win_out.scale(Window::DimY, 0.25f);
				152	Iterator in(input, window);
				153
				154	win_out.set_dimension_step(Window::DimX, 16);
				155	Iterator out(output, win_out);
				156
Michalis Spyrou	bcfd09a	2019-05-01 13:03:59 +0100	[diff] [blame^]	157	execute_window_loop(window, [&](const Coordinates &)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	158	{
				159	const uint32x4x4_t data =
				160	{
				161	{
Moritz Pflanzer	beabe3b	2017-08-31 14:56:32 +0100	[diff] [blame]	162	vld1q_u32(reinterpret_cast<const uint32_t >(in.ptr() + 0 in_stride)),
				163	vld1q_u32(reinterpret_cast<const uint32_t >(in.ptr() + 1 in_stride)),
				164	vld1q_u32(reinterpret_cast<const uint32_t >(in.ptr() + 2 in_stride)),
				165	vld1q_u32(reinterpret_cast<const uint32_t >(in.ptr() + 3 in_stride))
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	166	}
				167	};
				168	vst4q_u32(reinterpret_cast<uint32_t *>(out.ptr()), data);
				169	},
				170	in, out);
				171	}
				172	} // namespace
				173
				174	NEGEMMInterleave4x4Kernel::NEGEMMInterleave4x4Kernel()
				175	: _func(nullptr)
				176	{
				177	}
				178
				179	void NEGEMMInterleave4x4Kernel::configure(const ITensor input, ITensor output)
				180	{
Georgios Pinitas	a3b1b46	2017-11-16 19:24:39 +0000	[diff] [blame]	181	ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
Gian Marco Iodice	ec8b45e	2017-06-22 13:00:39 +0100	[diff] [blame]	182
Gian Marco Iodice	ec8b45e	2017-06-22 13:00:39 +0100	[diff] [blame]	183	// Output auto inizialitation if not yet initialized
Isabella Gottardi	e6630e4	2018-01-18 15:50:39 +0000	[diff] [blame]	184	auto_init_if_empty(output->info(), input->info()->clone()->set_tensor_shape(compute_interleaved_shape(input->info())));
Gian Marco Iodice	ec8b45e	2017-06-22 13:00:39 +0100	[diff] [blame]	185
Georgios Pinitas	a3b1b46	2017-11-16 19:24:39 +0000	[diff] [blame]	186	// Perform validate step
				187	ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	188
				189	_input = input;
				190	_output = output;
				191
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	192	switch(input->info()->element_size())
				193	{
				194	case 1:
Georgios Pinitas	a3b1b46	2017-11-16 19:24:39 +0000	[diff] [blame]	195	_func = &gemm_interleave_8bit_elements;
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	196	break;
				197	case 2:
				198	_func = &gemm_interleave_16bit_elements;
				199	break;
				200	case 4:
				201	_func = &gemm_interleave_32bit_elements;
				202	break;
				203	default:
				204	ARM_COMPUTE_ERROR_ON("Element size not supported");
				205	break;
				206	}
				207
				208	// Configure kernel window
Georgios Pinitas	a3b1b46	2017-11-16 19:24:39 +0000	[diff] [blame]	209	auto win_config = validate_and_configure_window(input->info(), output->info());
				210	ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
				211	INEKernel::configure(win_config.second);
				212	}
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	213
Georgios Pinitas	631c41a	2017-12-06 11:53:03 +0000	[diff] [blame]	214	Status NEGEMMInterleave4x4Kernel::validate(const ITensorInfo input, const ITensorInfo output)
Georgios Pinitas	a3b1b46	2017-11-16 19:24:39 +0000	[diff] [blame]	215	{
				216	ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
				217	ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	218
Georgios Pinitas	631c41a	2017-12-06 11:53:03 +0000	[diff] [blame]	219	return Status{};
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	220	}
				221
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	222	void NEGEMMInterleave4x4Kernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	223	{
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	224	ARM_COMPUTE_UNUSED(info);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	225	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				226	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
				227	ARM_COMPUTE_ERROR_ON(_func == nullptr);
				228	/*
				229	* This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
				230	* \|a00 a01 a02 a03\|
				231	* \|a10 a11 a12 a13\|
				232	* \|a20 a21 a22 a23\| = \| a00 a10 a20 a30 \|\| a01 a11 a21 a31 \|\| a02 a12 a22 a32 \|\| a03 a13 a23 a33 \|
				233	* \|a30 a31 a32 a33\|
				234	*
				235	* After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ]
				236	*/
				237	(*_func)(_input, _output, window);
				238	}