blob: ccf73883f012f62a7f98b4bd1dda312647aa2134 [file] [log] [blame]
/*
2 * Copyright (c) 2021 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h"
25#include "arm_compute/core/Utils.h"
26#include "arm_compute/core/Validate.h"
27#include "arm_compute/core/utils/misc/ShapeCalculator.h"
28#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
29#include "src/core/CPP/Validate.h"
30#include "src/core/NEON/INEKernel.h"
31#include "src/core/helpers/AutoConfiguration.h"
32#include "src/core/helpers/WindowHelpers.h"
33
34#include <arm_neon.h>
35
36namespace arm_compute
37{
38namespace cpu
39{
40namespace kernels
41{
42using namespace arm_compute::misc::shape_calculator;
43
44void CpuPoolingAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
45{
46 ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
47
48 // dst initialization if not yet initialized
49 auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, info)));
50
51 const bool requantize = src->quantization_info() != dst->quantization_info();
52
53 switch(src->data_type())
54 {
55 case DataType::QASYMM8:
56 if(requantize)
57 {
58 create_arm_pooling_requant<uint8_t, uint8_t>(src, dst, info, cpu_info);
59 }
60 else
61 {
62 create_arm_pooling<uint8_t, uint8_t>(src, dst, info, cpu_info);
63 }
64 break;
65 case DataType::QASYMM8_SIGNED:
66 if(requantize)
67 {
68 create_arm_pooling_requant<int8_t, int8_t>(src, dst, info, cpu_info);
69 }
70 else
71 {
72 create_arm_pooling<int8_t, int8_t>(src, dst, info, cpu_info);
73 }
74 break;
75#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
76 case DataType::F16:
77 create_arm_pooling<float16_t, float16_t>(src, dst, info, cpu_info);
78 break;
79#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
80 case DataType::F32:
81 create_arm_pooling<float, float>(src, dst, info, cpu_info);
82 break;
83 default:
84 break;
85 }
86
87 Window win = calculate_max_window(*dst, Steps());
88 INEKernel::configure(win);
89}
90
91Status CpuPoolingAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info)
92{
93 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
94
95#ifndef __aarch64__
96 ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels");
97#endif /* __aarch64__ */
98 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
99 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
100 ARM_COMPUTE_RETURN_ERROR_ON_MSG((src->data_layout() != DataLayout::NHWC) || (info.data_layout != DataLayout::NHWC), "Only NHWC is supported by assembly kernels");
101 ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.pool_type != PoolingType::AVG) && (info.pool_type != PoolingType::MAX),
102 "Only AVG and MAX pooling are supported by assembly kernels");
103
104 if(dst->total_size() > 0)
105 {
106 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
107
108 const auto src_qinfo = src->quantization_info().uniform();
109 const auto dst_qinfo = dst->quantization_info().uniform();
110
111 if(src_qinfo != dst_qinfo)
112 {
113 const float multiplier = src_qinfo.scale / dst_qinfo.scale;
114 int32_t dst_multiplier{};
115 int32_t dst_shift{};
116 ARM_COMPUTE_RETURN_ERROR_ON(quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift));
117 }
118 else
119 {
120 if(src->data_type() == DataType::QASYMM8)
121 {
122 const bool has_padding = info.pad_stride_info.has_padding();
123 ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info");
124 }
125 }
126 }
127 else
128 {
129 if(src->data_type() == DataType::QASYMM8)
130 {
131 // If dst is not configured, the quantization info are the same
132 const bool has_padding = info.pad_stride_info.has_padding();
133 ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info");
134 }
135 }
136 return Status{};
137}
138
139void CpuPoolingAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
140{
141 ARM_COMPUTE_ERROR_ON_NULLPTR(_kernel_asm.get());
142 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
143 ARM_COMPUTE_UNUSED(window);
144 ARM_COMPUTE_UNUSED(info);
145
146 ARM_COMPUTE_ERROR_ON(tensors.empty());
147
148 const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC);
Michele Di Giorgio0c19cbd2021-05-11 17:41:32 +0100149 ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
150 ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT_0);
Michele Di Giorgio19289042021-02-03 16:05:00 +0000151
152 const auto in_ptr = src->buffer() + src->info()->offset_first_element_in_bytes();
153 auto out_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes();
154 auto working_space = workspace->buffer() + workspace->info()->offset_first_element_in_bytes();
155
156 const auto src_shape = src->info()->tensor_shape();
157 const auto dst_shape = dst->info()->tensor_shape();
158 const auto src_padding = src->info()->padding();
159 const auto dst_padding = dst->info()->padding();
160
161 const size_t ld_src_col = src_shape[0] + src_padding.left + src_padding.right;
162 const size_t ld_src_row = ld_src_col * (src_shape[1] + src_padding.top + src_padding.bottom);
163 const size_t ld_src_batch = ld_src_row * src_shape[2];
164 const size_t ld_dst_col = dst_shape[0] + dst_padding.left + dst_padding.right;
165 const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom);
166 const size_t ld_dst_batch = ld_dst_row * dst_shape[2];
167
168 _kernel_asm->execute(in_ptr, ld_src_col, ld_src_row, ld_src_batch,
169 out_ptr, ld_dst_col, ld_dst_row, ld_dst_batch,
170 working_space, info.thread_id, info.num_threads);
171}
172
173size_t CpuPoolingAssemblyWrapperKernel::get_working_size(unsigned int num_threads) const
174{
175 return _kernel_asm->get_working_size(num_threads);
176}
177
178bool CpuPoolingAssemblyWrapperKernel::is_configured() const
179{
180 return _kernel_asm != nullptr;
181}
182
183template <typename Typesrc, typename Typedst>
184void CpuPoolingAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
185{
186 const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX;
187
188 arm_conv::pooling::PoolingWindow window{};
189 window.cols = static_cast<unsigned int>(info.pool_size.x());
190 window.rows = static_cast<unsigned int>(info.pool_size.y());
191
192 arm_conv::pooling::PoolingStride stride{};
193 std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride();
194
195 const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() };
196
197 constexpr unsigned int idx_width = 1;
198 constexpr unsigned int idx_height = 2;
199 constexpr unsigned int idx_channels = 0;
200 constexpr unsigned int idx_batches = 3;
201
202 const unsigned int n_batches = src->dimension(idx_batches);
203 const unsigned int src_rows = src->dimension(idx_height);
204 const unsigned int src_cols = src->dimension(idx_width);
205 const unsigned int n_channels = src->dimension(idx_channels);
206 const unsigned int dst_rows = dst->dimension(idx_height);
207 const unsigned int dst_cols = dst->dimension(idx_width);
208
209 arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr);
210
211 // Configure assembly pooling kernel
212 auto pooling_kernel_asm = arm_conv::pooling::pooling<Typesrc, Typedst>(args);
213 if(pooling_kernel_asm == nullptr)
214 {
215 // Configuration not supported: Leave function unconfigured:
216 return;
217 }
218
219 _kernel_asm = std::move(pooling_kernel_asm);
220}
221
222template <typename Typesrc, typename Typedst>
223void CpuPoolingAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
224{
225 const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX;
226
227 arm_conv::pooling::PoolingWindow window{};
228 window.cols = static_cast<unsigned int>(info.pool_size.x());
229 window.rows = static_cast<unsigned int>(info.pool_size.y());
230
231 arm_conv::pooling::PoolingStride stride{};
232 std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride();
233
234 const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() };
235
236 constexpr unsigned int idx_width = 1;
237 constexpr unsigned int idx_height = 2;
238 constexpr unsigned int idx_channels = 0;
239 constexpr unsigned int idx_batches = 3;
240
241 const unsigned int n_batches = src->dimension(idx_batches);
242 const unsigned int src_rows = src->dimension(idx_height);
243 const unsigned int src_cols = src->dimension(idx_width);
244 const unsigned int n_channels = src->dimension(idx_channels);
245 const unsigned int dst_rows = dst->dimension(idx_height);
246 const unsigned int dst_cols = dst->dimension(idx_width);
247
248 arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr);
249
250 const auto src_qinfo = src->quantization_info().uniform();
251 const auto dst_qinfo = dst->quantization_info().uniform();
252
253 const float multiplier = src_qinfo.scale / dst_qinfo.scale;
254 int32_t dst_multiplier{};
255 int32_t dst_shift{};
256 quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift);
257
258 const arm_conv::pooling::Requantize32 requant_args(src_qinfo.offset,
259 dst_qinfo.offset,
260 dst_shift, // left shift
261 0, // right shift
262 dst_multiplier);
263
264 // Configure assembly pooling kernel with requantization
265 auto pooling_kernel_asm = arm_conv::pooling::pooling<Typesrc, Typedst, arm_conv::pooling::Requantize32>(args, requant_args);
266 if(pooling_kernel_asm == nullptr)
267 {
268 // Configuration not supported: Leave function unconfigured:
269 return;
270 }
271
272 _kernel_asm = std::move(pooling_kernel_asm);
273}
274} // namespace kernels
275} // namespace cpu
276} // namespace arm_compute