Blame - src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp - ml/ComputeLibrary

blob: cac2a6bd058e39b938e6b3fdb677a95ad76f8d93 [file] [log] [blame]

Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2016, 2017 ARM Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
				25
				26	#include "arm_compute/core/Error.h"
				27	#include "arm_compute/core/Helpers.h"
				28	#include "arm_compute/core/ITensor.h"
Michele Di Giorgio	81f0d15	2017-07-11 15:00:52 +0100	[diff] [blame]	29	#include "arm_compute/core/NEON/NEFixedPoint.h"
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	30	#include "arm_compute/core/TensorInfo.h"
				31	#include "arm_compute/core/Validate.h"
				32
				33	#include <algorithm>
				34	#include <arm_neon.h>
				35	#include <cstdint>
				36	#include <map>
				37	#include <string>
				38
				39	using namespace arm_compute;
				40
				41	namespace arm_compute
				42	{
				43	class Coordinates;
				44	} // namespace arm_compute
				45
				46	namespace
				47	{
Michele Di Giorgio	81f0d15	2017-07-11 15:00:52 +0100	[diff] [blame]	48	void sub_wrap_QS8_QS8_QS8(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				49	{
				50	Iterator input1(in1, window);
				51	Iterator input2(in2, window);
				52	Iterator output(out, window);
				53
				54	execute_window_loop(window, [&](const Coordinates & id)
				55	{
				56	const qint8x16_t a = vld1q_qs8(reinterpret_cast<const qint8_t *>(input1.ptr()));
				57	const qint8x16_t b = vld1q_qs8(reinterpret_cast<const qint8_t *>(input2.ptr()));
				58
				59	vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vsubq_qs8(a, b));
				60	},
				61	input1, input2, output);
				62	}
				63
				64	void sub_saturate_QS8_QS8_QS8(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				65	{
				66	Iterator input1(in1, window);
				67	Iterator input2(in2, window);
				68	Iterator output(out, window);
				69
				70	execute_window_loop(window, [&](const Coordinates & id)
				71	{
				72	const qint8x16_t a = vld1q_qs8(reinterpret_cast<const qint8_t *>(input1.ptr()));
				73	const qint8x16_t b = vld1q_qs8(reinterpret_cast<const qint8_t *>(input2.ptr()));
				74
				75	vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqsubq_qs8(a, b));
				76	},
				77	input1, input2, output);
				78	}
				79
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	80	void sub_wrap_U8_U8_U8(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				81	{
				82	Iterator input1(in1, window);
				83	Iterator input2(in2, window);
				84	Iterator output(out, window);
				85
				86	execute_window_loop(window, [&](const Coordinates & id)
				87	{
				88	const uint8x16_t ta1 = vld1q_u8(input1.ptr());
				89	const uint8x16_t ta2 = vld1q_u8(input2.ptr());
				90
				91	vst1q_u8(output.ptr(), vsubq_u8(ta1, ta2));
				92	},
				93	input1, input2, output);
				94	}
				95
				96	void sub_saturate_U8_U8_U8(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				97	{
				98	Iterator input1(in1, window);
				99	Iterator input2(in2, window);
				100	Iterator output(out, window);
				101
				102	execute_window_loop(window, [&](const Coordinates & id)
				103	{
				104	const uint8x16_t ta1 = vld1q_u8(input1.ptr());
				105	const uint8x16_t ta2 = vld1q_u8(input2.ptr());
				106
				107	vst1q_u8(output.ptr(), vqsubq_u8(ta1, ta2));
				108	},
				109	input1, input2, output);
				110	}
				111
				112	void sub_wrap_S16_S16_S16(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				113	{
				114	Iterator input1(in1, window);
				115	Iterator input2(in2, window);
				116	Iterator output(out, window);
				117
				118	execute_window_loop(window, [&](const Coordinates & id)
				119	{
				120	const int16x8x2_t ta1 = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
				121	const int16x8x2_t ta2 = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
				122
				123	const int16x8x2_t ta3 =
				124	{
				125	{
				126	vsubq_s16(ta1.val[0], ta2.val[0]),
				127	vsubq_s16(ta1.val[1], ta2.val[1])
				128	}
				129	};
				130
				131	vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), ta3);
				132	},
				133	input1, input2, output);
				134	}
				135
				136	void sub_saturate_S16_S16_S16(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				137	{
				138	Iterator input1(in1, window);
				139	Iterator input2(in2, window);
				140	Iterator output(out, window);
				141
				142	execute_window_loop(window, [&](const Coordinates & id)
				143	{
				144	const int16x8x2_t ta1 = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
				145	const int16x8x2_t ta2 = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
				146
				147	const int16x8x2_t ta3 =
				148	{
				149	{
				150	vqsubq_s16(ta1.val[0], ta2.val[0]),
				151	vqsubq_s16(ta1.val[1], ta2.val[1])
				152	}
				153	};
				154
				155	vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), ta3);
				156	},
				157	input1, input2, output);
				158	}
				159
				160	void sub_F32_F32_F32(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				161	{
				162	Iterator input1(in1, window);
				163	Iterator input2(in2, window);
				164	Iterator output(out, window);
				165
				166	execute_window_loop(window, [&](const Coordinates & id)
				167	{
				168	const float32x4x4_t ta1 = vld4q_f32(reinterpret_cast<const float *>(input1.ptr()));
				169	const float32x4x4_t ta2 = vld4q_f32(reinterpret_cast<const float *>(input2.ptr()));
				170
				171	const float32x4x4_t ta3 =
				172	{
				173	{
				174	vsubq_f32(ta1.val[0], ta2.val[0]),
				175	vsubq_f32(ta1.val[1], ta2.val[1]),
				176	vsubq_f32(ta1.val[2], ta2.val[2]),
				177	vsubq_f32(ta1.val[3], ta2.val[3]),
				178	}
				179	};
				180
				181	vst4q_f32(reinterpret_cast<float *>(output.ptr()), ta3);
				182	},
				183	input1, input2, output);
				184	}
				185	void sub_wrap_S16_U8_S16(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				186	{
				187	Iterator input1(in1, window);
				188	Iterator input2(in2, window);
				189	Iterator output(out, window);
				190
				191	execute_window_loop(window, [&](const Coordinates & id)
				192	{
				193	const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
				194	int16x8_t a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
				195	int16x8_t a2_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8);
				196
				197	a1_0 = vsubq_s16(a1_0, vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))));
				198	a2_0 = vsubq_s16(a2_0, vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))));
				199
				200	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
				201	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
				202	},
				203	input1, input2, output);
				204	}
				205
				206	void sub_saturate_S16_U8_S16(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				207	{
				208	Iterator input1(in1, window);
				209	Iterator input2(in2, window);
				210	Iterator output(out, window);
				211
				212	execute_window_loop(window, [&](const Coordinates & id)
				213	{
				214	const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
				215	int16x8_t a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
				216	int16x8_t a2_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8);
				217
				218	a1_0 = vqsubq_s16(a1_0, vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))));
				219	a2_0 = vqsubq_s16(a2_0, vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))));
				220
				221	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
				222	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
				223	},
				224	input1, input2, output);
				225	}
				226
				227	void sub_wrap_U8_S16_S16(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				228	{
				229	Iterator input1(in1, window);
				230	Iterator input2(in2, window);
				231	Iterator output(out, window);
				232
				233	execute_window_loop(window, [&](const Coordinates & id)
				234	{
				235	const uint8x16_t bv_0 = vld1q_u8(input1.ptr());
				236	int16x8_t a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
				237	int16x8_t a2_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()) + 8);
				238
				239	a1_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))), a1_0);
				240	a2_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))), a2_0);
				241
				242	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
				243	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
				244	},
				245	input1, input2, output);
				246	}
				247
				248	void sub_saturate_U8_S16_S16(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				249	{
				250	Iterator input1(in1, window);
				251	Iterator input2(in2, window);
				252	Iterator output(out, window);
				253
				254	execute_window_loop(window, [&](const Coordinates & id)
				255	{
				256	const uint8x16_t bv_0 = vld1q_u8(input1.ptr());
				257	int16x8_t a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
				258	int16x8_t a2_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()) + 8);
				259
				260	a1_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))), a1_0);
				261	a2_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))), a2_0);
				262
				263	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
				264	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
				265	},
				266	input1, input2, output);
				267	}
				268
				269	void sub_wrap_U8_U8_S16(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				270	{
				271	Iterator input1(in1, window);
				272	Iterator input2(in2, window);
				273	Iterator output(out, window);
				274
				275	execute_window_loop(window, [&](const Coordinates & id)
				276	{
				277	const uint8x16_t av_0 = vld1q_u8(input1.ptr());
				278	const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
				279
				280	const int16x8_t a1_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(av_0))),
				281	vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))));
				282	const int16x8_t a2_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(av_0))),
				283	vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))));
				284
				285	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
				286	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
				287	},
				288	input1, input2, output);
				289	}
				290
				291	void sub_saturate_U8_U8_S16(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				292	{
				293	Iterator input1(in1, window);
				294	Iterator input2(in2, window);
				295	Iterator output(out, window);
				296
				297	execute_window_loop(window, [&](const Coordinates & id)
				298	{
				299	const uint8x16_t av_0 = vld1q_u8(input1.ptr());
				300	const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
				301
				302	const int16x8_t a1_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(av_0))),
				303	vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))));
				304	const int16x8_t a2_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(av_0))),
				305	vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))));
				306
				307	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
				308	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
				309	},
				310	input1, input2, output);
				311	}
				312	} // namespace
				313
				314	NEArithmeticSubtractionKernel::NEArithmeticSubtractionKernel()
				315	: _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr)
				316	{
				317	}
				318
				319	void NEArithmeticSubtractionKernel::configure(const ITensor input1, const ITensor input2, ITensor *output, ConvertPolicy policy)
				320	{
				321	ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
				322
Georgios Pinitas	f0dea70	2017-07-03 18:17:28 +0100	[diff] [blame]	323	// Auto initialize output if not initialized
				324	{
				325	set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	326
Georgios Pinitas	f0dea70	2017-07-03 18:17:28 +0100	[diff] [blame]	327	if(input1->info()->data_type() == DataType::S16 \|\| input2->info()->data_type() == DataType::S16)
				328	{
				329	set_format_if_unknown(*output->info(), Format::S16);
				330	}
				331	else if(input1->info()->data_type() == DataType::F32 \|\| input2->info()->data_type() == DataType::F32)
				332	{
				333	set_format_if_unknown(*output->info(), Format::F32);
				334	}
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	335	}
				336
				337	ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
Michele Di Giorgio	81f0d15	2017-07-11 15:00:52 +0100	[diff] [blame]	338	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F32);
				339	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F32);
				340	ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F32);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	341	ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 \|\| input2->info()->data_type() != DataType::U8),
				342	"Output can only be U8 if both inputs are U8");
Michele Di Giorgio	81f0d15	2017-07-11 15:00:52 +0100	[diff] [blame]	343	if(is_data_type_fixed_point(input1->info()->data_type()) \|\| is_data_type_fixed_point(input2->info()->data_type()) \|\| is_data_type_fixed_point(output->info()->data_type()))
				344	{
				345	// Check that all data types are the same and all fixed-point positions are the same
				346	ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output);
				347	}
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	348
				349	static std::map<std::string, SubFunction *> map_function =
				350	{
Michele Di Giorgio	81f0d15	2017-07-11 15:00:52 +0100	[diff] [blame]	351	{ "sub_wrap_QS8_QS8_QS8", &sub_wrap_QS8_QS8_QS8 },
				352	{ "sub_saturate_QS8_QS8_QS8", &sub_saturate_QS8_QS8_QS8 },
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	353	{ "sub_wrap_U8_U8_U8", &sub_wrap_U8_U8_U8 },
				354	{ "sub_wrap_U8_U8_S16", &sub_wrap_U8_U8_S16 },
				355	{ "sub_saturate_U8_U8_U8", &sub_saturate_U8_U8_U8 },
				356	{ "sub_saturate_U8_U8_S16", &sub_saturate_U8_U8_S16 },
				357	{ "sub_wrap_U8_S16_S16", &sub_wrap_U8_S16_S16 },
				358	{ "sub_wrap_S16_U8_S16", &sub_wrap_S16_U8_S16 },
				359	{ "sub_saturate_U8_S16_S16", &sub_saturate_U8_S16_S16 },
				360	{ "sub_saturate_S16_U8_S16", &sub_saturate_S16_U8_S16 },
Michele Di Giorgio	81f0d15	2017-07-11 15:00:52 +0100	[diff] [blame]	361	{ "sub_wrap_QS16_QS16_QS16", &sub_wrap_S16_S16_S16 },
				362	{ "sub_saturate_QS16_QS16_QS16", &sub_saturate_S16_S16_S16 },
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	363	{ "sub_wrap_S16_S16_S16", &sub_wrap_S16_S16_S16 },
				364	{ "sub_saturate_S16_S16_S16", &sub_saturate_S16_S16_S16 },
				365	{ "sub_wrap_F32_F32_F32", &sub_F32_F32_F32 },
				366	{ "sub_saturate_F32_F32_F32", &sub_F32_F32_F32 },
				367	};
				368
				369	_input1 = input1;
				370	_input2 = input2;
				371	_output = output;
				372
				373	std::string function_to_call("sub_");
				374	function_to_call += policy == ConvertPolicy::WRAP ? "wrap_" : "saturate_";
				375	function_to_call += string_from_data_type(input1->info()->data_type()) + "_";
				376	function_to_call += string_from_data_type(input2->info()->data_type()) + "_";
				377	function_to_call += string_from_data_type(output->info()->data_type());
				378
				379	auto it = map_function.find(function_to_call);
				380
				381	if(it != map_function.end())
				382	{
				383	_func = it->second;
				384	}
				385	else
				386	{
				387	ARM_COMPUTE_ERROR("You called subtract with the wrong image formats");
				388	}
				389
				390	constexpr unsigned int num_elems_processed_per_iteration = 16;
				391
				392	// Configure kernel window
				393	Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
				394	AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
				395
				396	update_window_and_padding(win,
				397	AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
				398	AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
				399	output_access);
				400
				401	ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
				402	input2->info()->valid_region());
				403
				404	output_access.set_valid_region(win, valid_region);
				405
				406	INEKernel::configure(win);
				407	}
				408
				409	void NEArithmeticSubtractionKernel::run(const Window &window)
				410	{
				411	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				412	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
				413	ARM_COMPUTE_ERROR_ON(_func == nullptr);
				414
				415	(*_func)(_input1, _input2, _output, window);
				416	}