Blame - src/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.cpp - ml/ComputeLibrary

blob: a9c624abd08f7bbbbee4304a2c518194ec0716fa [file] [log] [blame]

Pablo Tello	bf2fb95	2017-09-29 16:43:25 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2017 ARM Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h"
				25
				26	#include "arm_compute/core/Error.h"
				27	#include "arm_compute/core/Helpers.h"
				28	#include "arm_compute/core/ITensor.h"
				29	#include "arm_compute/core/NEON/INEKernel.h"
				30	#include "arm_compute/core/Types.h"
				31	#include "arm_compute/core/Validate.h"
				32	#include "arm_compute/core/Window.h"
				33
				34	#include <arm_neon.h>
				35	#include <cstddef>
				36	#include <cstdint>
				37	#include <tuple>
				38
				39	using namespace arm_compute;
				40
				41	namespace
				42	{
				43	inline void gemm_interleave_8bit_elements(const ITensor input, ITensor output, const Window &window, unsigned int block_width, unsigned int block_height, bool transpose)
				44	{
				45	const size_t in_stride = input->info()->strides_in_bytes()[1];
				46	const float scale_y_factor = 1.f / float(block_height);
				47
				48	// Set window for output tensor
				49	Window win_out(window);
				50	win_out.scale(Window::DimY, scale_y_factor);
				51	Iterator in(input, window);
				52
				53	win_out.set_dimension_step(Window::DimX, block_width * block_height);
				54	Iterator out(output, win_out);
				55	execute_window_loop(window, [&](const Coordinates & id)
				56	{
				57	int j = 0;
				58	for(unsigned int z = 0; z < block_height; ++z)
				59	{
				60	for(unsigned int b = 0; b < block_width; ++b)
				61	{
				62	if(!transpose)
				63	{
				64	const bool inbounds = (id.x() + b) < input->info()->dimension(0) && (id.y() + z) < input->info()->dimension(1);
				65	(out.ptr() + j++) = (inbounds) ? (in.ptr() + z * in_stride + b) : 0;
				66	}
				67	else
				68	{
				69	const bool inbounds = (id.x() + b) < input->info()->dimension(1) && (id.y() + z) < input->info()->dimension(0);
				70	const uint8_t value = (inbounds) ? (input->buffer() + (id.x() + b) in_stride + (id.y() + z)) : 0;
				71	*(out.ptr() + j++) = value;
				72	}
				73	}
				74	}
				75	},
				76	in, out);
				77	}
				78
				79	} // namespace
				80
				81	NEGEMMInterleaveBlockedKernel::NEGEMMInterleaveBlockedKernel()
				82	: _block_height(0), _block_width(0), _transpose(false)
				83	{
				84	}
				85
				86	void NEGEMMInterleaveBlockedKernel::configure(const ITensor input, ITensor output, unsigned int block_height, unsigned int block_width, bool transpose)
				87	{
				88	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
				89	ARM_COMPUTE_ERROR_ON_NULLPTR(output);
				90	ARM_COMPUTE_ERROR_ON_MSG(block_height < 1, "Block height must be greater than 0");
				91	ARM_COMPUTE_ERROR_ON_MSG(block_width < 1, "Block window must be greater than 0");
				92
				93	TensorShape output_shape = input->info()->tensor_shape();
				94	const float interleave_by_f32 = block_height;
				95	output_shape.set(0, input->info()->dimension(0) * interleave_by_f32);
				96	output_shape.set(1, std::ceil(static_cast<float>(input->info()->dimension(1)) / interleave_by_f32));
				97	// Output auto inizialitation if not yet initialized
				98	auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
				99	ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
				100	ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
				101	ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
				102
				103	_input = input;
				104	_output = output;
				105	_block_height = block_height;
				106	_block_width = block_width;
				107	_transpose = transpose;
				108
				109	const unsigned int num_elems_processed_per_iteration_x = block_width;
				110	const unsigned int num_elems_processed_per_iteration_y = block_height;
				111
				112	// Configure kernel window
				113	Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
				114	const float scaley_factor = 1.f / interleave_by_f32;
				115
				116	AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y, 1, num_elems_processed_per_iteration_y, scaley_factor);
				117	AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
				118	update_window_and_padding(win, output_access, input_access);
				119
				120	output_access.set_valid_region(win, input->info()->valid_region());
				121
				122	INEKernel::configure(win);
				123	}
				124
				125	void NEGEMMInterleaveBlockedKernel::run(const Window &window, const ThreadInfo &info)
				126	{
				127	ARM_COMPUTE_UNUSED(info);
				128	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				129	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
				130	gemm_interleave_8bit_elements(_input, _output, window, _block_width, _block_height, _transpose);
				131	}