Blame - src/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.cpp - ml/ComputeLibrary

blob: 2a4a46e76c44af4df001e60fa6d3ce167062beb2 [file] [log] [blame]

Pablo Tello	bf2fb95	2017-09-29 16:43:25 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2017 ARM Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
#include "arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/INEKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"

#include <arm_neon.h>

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <tuple>
				38
				39	using namespace arm_compute;
				40
				41	namespace
				42	{
Pablo Tello	eae4ce0	2017-10-13 15:24:49 +0100	[diff] [blame]	43	inline void gemm_interleave_blocked_transposed_8bit(const ITensor input, ITensor output, const Window &window, unsigned int block_width, unsigned int block_height)
Pablo Tello	bf2fb95	2017-09-29 16:43:25 +0100	[diff] [blame]	44	{
Pablo Tello	eae4ce0	2017-10-13 15:24:49 +0100	[diff] [blame]	45	const size_t in_stride = input->info()->strides_in_bytes()[1];
				46
				47	const unsigned int in_height = input->info()->dimension(1);
				48	const unsigned int in_width = input->info()->dimension(0);
				49
				50	const float scale_y_factor = 1.f / float(block_height);
Pablo Tello	bf2fb95	2017-09-29 16:43:25 +0100	[diff] [blame]	51
				52	// Set window for output tensor
				53	Window win_out(window);
				54	win_out.scale(Window::DimY, scale_y_factor);
				55	Iterator in(input, window);
				56
				57	win_out.set_dimension_step(Window::DimX, block_width * block_height);
				58	Iterator out(output, win_out);
Pablo Tello	eae4ce0	2017-10-13 15:24:49 +0100	[diff] [blame]	59
				60	execute_window_loop(window, [&](const Coordinates &)
				61	{
				62	std::fill_n(out.ptr(), block_width * block_height, 0);
				63	},
				64	out);
				65
Pablo Tello	bf2fb95	2017-09-29 16:43:25 +0100	[diff] [blame]	66	execute_window_loop(window, [&](const Coordinates & id)
				67	{
Pablo Tello	eae4ce0	2017-10-13 15:24:49 +0100	[diff] [blame]	68	for(unsigned int z = id.y(); (z < in_width) && z < (id.y() + block_height); ++z)
Pablo Tello	bf2fb95	2017-09-29 16:43:25 +0100	[diff] [blame]	69	{
Pablo Tello	eae4ce0	2017-10-13 15:24:49 +0100	[diff] [blame]	70	int j = (z - id.y()) * block_width;
				71	for(unsigned int b = id.x(); (b < in_height) && (b < (id.x() + block_width)); ++b)
Pablo Tello	bf2fb95	2017-09-29 16:43:25 +0100	[diff] [blame]	72	{
Pablo Tello	eae4ce0	2017-10-13 15:24:49 +0100	[diff] [blame]	73	(out.ptr() + j++) = (input->buffer() + b * in_stride + z);
Pablo Tello	bf2fb95	2017-09-29 16:43:25 +0100	[diff] [blame]	74	}
				75	}
				76	},
				77	in, out);
				78	}
				79
Pablo Tello	eae4ce0	2017-10-13 15:24:49 +0100	[diff] [blame]	80	inline void gemm_interleave_blocked_8bit(const ITensor input, ITensor output, const Window &window, unsigned int block_width, unsigned int block_height)
				81	{
				82	const size_t in_stride = input->info()->strides_in_bytes()[1];
				83
				84	const unsigned int in_height = input->info()->dimension(1);
				85	const unsigned int in_width = input->info()->dimension(0);
				86
				87	const float scale_y_factor = 1.f / float(block_height);
				88
				89	// Set window for output tensor
				90	Window win_out(window);
				91	win_out.scale(Window::DimY, scale_y_factor);
				92	Iterator in(input, window);
				93
				94	win_out.set_dimension_step(Window::DimX, block_width * block_height);
				95	Iterator out(output, win_out);
				96
				97	execute_window_loop(window, [&](const Coordinates &)
				98	{
				99	std::fill_n(out.ptr(), block_width * block_height, 0);
				100	},
				101	out);
				102
				103	execute_window_loop(window, [&](const Coordinates & id)
				104	{
				105	for(unsigned int z = id.y(); (z < in_height) && z < (id.y() + block_height); ++z)
				106	{
				107	int j = (z - id.y()) * block_width;
				108	for(unsigned int b = id.x(); (b < in_width) && (b < (id.x() + block_width)); ++b)
				109	{
				110	(out.ptr() + j++) = (input->buffer() + z * in_stride + b);
				111	}
				112	}
				113	},
				114	in, out);
				115	}
Pablo Tello	bf2fb95	2017-09-29 16:43:25 +0100	[diff] [blame]	116	} // namespace
				117
				118	NEGEMMInterleaveBlockedKernel::NEGEMMInterleaveBlockedKernel()
				119	: _block_height(0), _block_width(0), _transpose(false)
				120	{
				121	}
				122
				123	void NEGEMMInterleaveBlockedKernel::configure(const ITensor input, ITensor output, unsigned int block_height, unsigned int block_width, bool transpose)
				124	{
				125	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
				126	ARM_COMPUTE_ERROR_ON_NULLPTR(output);
				127	ARM_COMPUTE_ERROR_ON_MSG(block_height < 1, "Block height must be greater than 0");
				128	ARM_COMPUTE_ERROR_ON_MSG(block_width < 1, "Block window must be greater than 0");
				129
				130	TensorShape output_shape = input->info()->tensor_shape();
				131	const float interleave_by_f32 = block_height;
				132	output_shape.set(0, input->info()->dimension(0) * interleave_by_f32);
				133	output_shape.set(1, std::ceil(static_cast<float>(input->info()->dimension(1)) / interleave_by_f32));
				134	// Output auto inizialitation if not yet initialized
				135	auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
				136	ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
				137	ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
				138	ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
				139
				140	_input = input;
				141	_output = output;
				142	_block_height = block_height;
				143	_block_width = block_width;
				144	_transpose = transpose;
				145
				146	const unsigned int num_elems_processed_per_iteration_x = block_width;
				147	const unsigned int num_elems_processed_per_iteration_y = block_height;
				148
				149	// Configure kernel window
				150	Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
				151	const float scaley_factor = 1.f / interleave_by_f32;
				152
				153	AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y, 1, num_elems_processed_per_iteration_y, scaley_factor);
				154	AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
				155	update_window_and_padding(win, output_access, input_access);
				156
				157	output_access.set_valid_region(win, input->info()->valid_region());
				158
				159	INEKernel::configure(win);
				160	}
				161
				162	void NEGEMMInterleaveBlockedKernel::run(const Window &window, const ThreadInfo &info)
				163	{
				164	ARM_COMPUTE_UNUSED(info);
				165	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				166	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
Pablo Tello	eae4ce0	2017-10-13 15:24:49 +0100	[diff] [blame]	167	if(_transpose)
				168	{
				169	gemm_interleave_blocked_transposed_8bit(_input, _output, window, _block_width, _block_height);
				170	}
				171	else
				172	{
				173	gemm_interleave_blocked_8bit(_input, _output, window, _block_width, _block_height);
				174	}
Pablo Tello	bf2fb95	2017-09-29 16:43:25 +0100	[diff] [blame]	175	}