Blame - src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp - ml/ComputeLibrary

blob: 81376fb029e0bf73418f1d56d03cb0a65572cabb [file] [log] [blame]

Michele Di Giorgio	93b75e0	2021-06-21 12:00:43 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2016-2021 Arm Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
Georgios Pinitas	7891a73	2021-08-20 21:39:25 +0100	[diff] [blame^]	24	#include "src/cpu/kernels/CpuGemmMatrixAdditionKernel.h"
Michele Di Giorgio	93b75e0	2021-06-21 12:00:43 +0100	[diff] [blame]	25
				26	#include "arm_compute/core/Helpers.h"
				27	#include "arm_compute/core/Types.h"
				28	#include "arm_compute/core/Validate.h"
				29	#include "src/core/CPP/Validate.h"
				30	#include "src/core/NEON/NEFixedPoint.h"
				31	#include "src/core/helpers/AutoConfiguration.h"
				32	#include "src/core/helpers/WindowHelpers.h"
				33
				34	#include <arm_neon.h>
				35
				36	namespace arm_compute
				37	{
				38	namespace cpu
				39	{
				40	namespace kernels
				41	{
				42	namespace
				43	{
				44	void matrix_addition_f32(const ITensor src, ITensor dst, const Window &window, float beta)
				45	{
				46	ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
				47	const float32x4_t beta_f32 = vdupq_n_f32(beta);
				48
				49	constexpr int window_step_x = 16;
				50	const auto window_start_x = static_cast<int>(window.x().start());
				51	const auto window_end_x = static_cast<int>(window.x().end());
				52
				53	Window win = window.collapse_if_possible(window, Window::DimZ);
				54	win.set(Window::DimX, Window::Dimension(0, 1, 1));
				55
				56	Iterator in(src, win);
				57	Iterator out(dst, win);
				58
				59	execute_window_loop(win, [&](const Coordinates &)
				60	{
				61	const auto in_ptr = reinterpret_cast<const float *>(in.ptr());
				62	const auto out_ptr = reinterpret_cast<float *>(out.ptr());
				63
				64	int x = window_start_x;
				65	for(; x < (window_end_x - window_step_x); x += window_step_x)
				66	{
				67	float32x4x4_t alpha_ab = vld4q_f32(out_ptr + x);
				68	const float32x4x4_t c = vld4q_f32(in_ptr + x);
				69
				70	// Multiply matrix C by its weight and accumulate
				71	alpha_ab.val[0] = vmlaq_f32(alpha_ab.val[0], c.val[0], beta_f32);
				72	alpha_ab.val[1] = vmlaq_f32(alpha_ab.val[1], c.val[1], beta_f32);
				73	alpha_ab.val[2] = vmlaq_f32(alpha_ab.val[2], c.val[2], beta_f32);
				74	alpha_ab.val[3] = vmlaq_f32(alpha_ab.val[3], c.val[3], beta_f32);
				75
				76	vst4q_f32(out_ptr + x, alpha_ab);
				77	}
				78
				79	// Left-over loop
				80	for(; x < window_end_x; ++x)
				81	{
				82	(out_ptr + x) += (in_ptr + x) * beta;
				83	}
				84	},
				85	in, out);
				86	}
				87
				88	#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
				89	void matrix_addition_f16(const ITensor src, ITensor dst, const Window &window, float beta)
				90	{
				91	ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
				92	const float16x8_t beta_f16 = vdupq_n_f16(beta);
				93
				94	constexpr int window_step_x = 16;
				95	const auto window_start_x = static_cast<int>(window.x().start());
				96	const auto window_end_x = static_cast<int>(window.x().end());
				97
				98	Window win = window.collapse_if_possible(window, Window::DimZ);
				99	win.set(Window::DimX, Window::Dimension(0, 1, 1));
				100
				101	Iterator in(src, win);
				102	Iterator out(dst, win);
				103
Freddie Liardet	bb5d42c	2021-07-28 14:24:02 +0100	[diff] [blame]	104	execute_window_loop(win, [&](const Coordinates &)
Michele Di Giorgio	93b75e0	2021-06-21 12:00:43 +0100	[diff] [blame]	105	{
				106	const auto in_ptr = reinterpret_cast<const float16_t *>(in.ptr());
				107	const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr());
				108
				109	int x = window_start_x;
				110	for(; x < (window_end_x - window_step_x); x += window_step_x)
				111	{
				112	float16x8x2_t alpha_ab = vld2q_f16(out_ptr + x);
				113	const float16x8x2_t c = vld2q_f16(in_ptr + x);
				114	// Multiply matrix C by its weight and accumulate
				115	alpha_ab.val[0] = vaddq_f16(alpha_ab.val[0], vmulq_f16(c.val[0], beta_f16));
				116	alpha_ab.val[1] = vaddq_f16(alpha_ab.val[1], vmulq_f16(c.val[1], beta_f16));
				117
				118	vst2q_f16(out_ptr + x, alpha_ab);
				119	}
				120
				121	// Left-over loop
				122	for(; x < window_end_x; ++x)
				123	{
				124	(out_ptr + x) += (in_ptr + x) * static_cast<float16_t>(beta);
				125	}
				126	},
				127	in, out);
				128	}
				129	#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
				130
				131	} // namespace
				132
				133	void CpuGemmMatrixAdditionKernel::configure(const ITensorInfo src, ITensorInfo dst, float beta)
				134	{
				135	ARM_COMPUTE_UNUSED(dst);
				136	ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
				137
				138	// Perform validation step
				139	ARM_COMPUTE_ERROR_THROW_ON(CpuGemmMatrixAdditionKernel::validate(src, dst, beta));
				140
				141	_beta = beta;
				142	switch(src->data_type())
				143	{
				144	case DataType::F32:
				145	_func = &matrix_addition_f32;
				146	break;
				147	case DataType::F16:
				148	#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
				149	_func = &matrix_addition_f16;
				150	break;
				151	#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
				152	default:
				153	ARM_COMPUTE_ERROR("Data type not supported");
				154	break;
				155	}
				156
				157	// Configure kernel window
				158	Window win = calculate_max_window(*src, Steps());
				159	ICPPKernel::configure(win);
				160	}
				161
				162	Status CpuGemmMatrixAdditionKernel::validate(const ITensorInfo src, const ITensorInfo dst, float beta)
				163	{
				164	ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
				165	ARM_COMPUTE_UNUSED(beta);
				166
				167	ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
				168	ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
				169
				170	if(dst->total_size() > 0)
				171	{
				172	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
				173	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
				174	}
				175	return Status{};
				176	}
				177
				178	void CpuGemmMatrixAdditionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
				179	{
				180	ARM_COMPUTE_UNUSED(info);
				181	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				182	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
				183	ARM_COMPUTE_ERROR_ON(tensors.empty());
				184
				185	const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC);
				186	ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
				187
				188	if(_beta != 0.0f)
				189	{
				190	(*_func)(src, dst, window, _beta);
				191	}
				192	}
				193
				194	const char *CpuGemmMatrixAdditionKernel::name() const
				195	{
				196	return "CpuGemmMatrixAdditionKernel";
				197	}
				198	} // namespace kernels
				199	} // namespace cpu
				200	} // namespace arm_compute