Blame - src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp - ml/ComputeLibrary

blob: 954a2c1754c7e8c30208fdfdc794f9ad6a0f2655 [file] [log] [blame]

Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1	/*
Diego Lopez Recas	0021d75	2017-12-18 14:42:56 +0000	[diff] [blame]	2	* Copyright (c) 2016-2018 ARM Limited.
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
				25
Anthony Barbier	eaefd00	2018-07-20 17:49:35 +0100	[diff] [blame]	26	#include "arm_compute/core/CPP/Validate.h"
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	27	#include "arm_compute/core/Error.h"
				28	#include "arm_compute/core/Helpers.h"
				29	#include "arm_compute/core/IAccessWindow.h"
				30	#include "arm_compute/core/ITensor.h"
Michele Di Giorgio	81f0d15	2017-07-11 15:00:52 +0100	[diff] [blame]	31	#include "arm_compute/core/NEON/NEFixedPoint.h"
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	32	#include "arm_compute/core/TensorInfo.h"
				33	#include "arm_compute/core/Validate.h"
				34
				35	#include <algorithm>
				36	#include <arm_neon.h>
				37	#include <cstdint>
				38	#include <map>
				39	#include <string>
				40
				41	using namespace arm_compute;
				42
				43	namespace arm_compute
				44	{
				45	class Coordinates;
				46	} // namespace arm_compute
				47
				48	namespace
				49	{
/** Number of elements handled by one window step; used both as the window
 *  step along X and as the width of every horizontal access window. */
constexpr unsigned int num_elems_processed_per_iteration{ 16U };
				51
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	52	void add_wrap_U8_U8_U8(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				53	{
Diego Lopez Recas	0021d75	2017-12-18 14:42:56 +0000	[diff] [blame]	54	Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
				55	Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	56	Iterator output(out, window);
				57
				58	execute_window_loop(window, [&](const Coordinates & id)
				59	{
				60	vst1q_u8(output.ptr(), vaddq_u8(vld1q_u8(input1.ptr()), vld1q_u8(input2.ptr())));
				61	},
				62	input1, input2, output);
				63	}
				64
				65	void add_saturate_U8_U8_U8(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				66	{
Diego Lopez Recas	0021d75	2017-12-18 14:42:56 +0000	[diff] [blame]	67	Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
				68	Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	69	Iterator output(out, window);
				70
				71	execute_window_loop(window, [&](const Coordinates & id)
				72	{
				73	vst1q_u8(output.ptr(), vqaddq_u8(vld1q_u8(input1.ptr()), vld1q_u8(input2.ptr())));
				74	},
				75	input1, input2, output);
				76	}
				77
				78	inline int16x8x2_t vadd2q_s16(const int16x8x2_t &a, const int16x8x2_t &b)
				79	{
				80	const int16x8x2_t res =
				81	{
				82	{
				83	vaddq_s16(a.val[0], b.val[0]),
				84	vaddq_s16(a.val[1], b.val[1])
				85	}
				86	};
				87
				88	return res;
				89	}
				90
				91	inline float32x4x4_t vadd4q_f32(const float32x4x4_t &a, const float32x4x4_t &b)
				92	{
				93	const float32x4x4_t res =
				94	{
				95	{
				96	vaddq_f32(a.val[0], b.val[0]),
				97	vaddq_f32(a.val[1], b.val[1]),
				98	vaddq_f32(a.val[2], b.val[2]),
				99	vaddq_f32(a.val[3], b.val[3])
				100	}
				101	};
				102
				103	return res;
				104	}
				105
				106	inline int16x8x2_t vqadd2q_s16(const int16x8x2_t &a, const int16x8x2_t &b)
				107	{
				108	const int16x8x2_t res =
				109	{
				110	{
				111	vqaddq_s16(a.val[0], b.val[0]),
				112	vqaddq_s16(a.val[1], b.val[1])
				113	}
				114	};
				115
				116	return res;
				117	}
				118
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** Lane-wise addition of two pairs of float16x8 vectors.
 *
 * Only compiled when FP16 vector arithmetic is available.
 *
 * @param[in] a First operand (two float16x8 vectors).
 * @param[in] b Second operand (two float16x8 vectors).
 *
 * @return Pair whose i-th vector is vaddq_f16(a.val[i], b.val[i]).
 */
inline float16x8x2_t vadd2q_f16(const float16x8x2_t &a, const float16x8x2_t &b)
{
    float16x8x2_t sum;
    sum.val[0] = vaddq_f16(a.val[0], b.val[0]);
    sum.val[1] = vaddq_f16(a.val[1], b.val[1]);
    return sum;
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello	d1b0ecc	2017-07-11 11:27:04 +0100	[diff] [blame]	133
				134	void add_F16_F16_F16(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				135	{
Ioan-Cristian Szabo	5edbd1c	2017-11-13 13:34:08 +0000	[diff] [blame]	136	#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Diego Lopez Recas	0021d75	2017-12-18 14:42:56 +0000	[diff] [blame]	137	Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
				138	Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Pablo Tello	d1b0ecc	2017-07-11 11:27:04 +0100	[diff] [blame]	139	Iterator output(out, window);
				140
				141	execute_window_loop(window, [&](const Coordinates & id)
				142	{
				143	const float16x8x2_t a = vld2q_f16(reinterpret_cast<const float16_t *>(input1.ptr()));
				144	const float16x8x2_t b = vld2q_f16(reinterpret_cast<const float16_t *>(input2.ptr()));
				145
				146	vst2q_f16(reinterpret_cast<float16_t *>(output.ptr()), vadd2q_f16(a, b));
				147	},
				148	input1, input2, output);
Ioan-Cristian Szabo	5edbd1c	2017-11-13 13:34:08 +0000	[diff] [blame]	149	#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello	d1b0ecc	2017-07-11 11:27:04 +0100	[diff] [blame]	150	ARM_COMPUTE_UNUSED(in1);
				151	ARM_COMPUTE_UNUSED(in2);
				152	ARM_COMPUTE_UNUSED(out);
				153	ARM_COMPUTE_UNUSED(window);
				154	ARM_COMPUTE_ERROR("Not supported, recompile the library with arch=arm64-v8.2-a");
Ioan-Cristian Szabo	5edbd1c	2017-11-13 13:34:08 +0000	[diff] [blame]	155	#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello	d1b0ecc	2017-07-11 11:27:04 +0100	[diff] [blame]	156	}
				157
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	158	void add_F32_F32_F32(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				159	{
Diego Lopez Recas	0021d75	2017-12-18 14:42:56 +0000	[diff] [blame]	160	Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
				161	Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	162	Iterator output(out, window);
				163
				164	execute_window_loop(window, [&](const Coordinates & id)
				165	{
				166	const float32x4x4_t a = vld4q_f32(reinterpret_cast<const float *>(input1.ptr()));
				167	const float32x4x4_t b = vld4q_f32(reinterpret_cast<const float *>(input2.ptr()));
				168
				169	vst4q_f32(reinterpret_cast<float *>(output.ptr()), vadd4q_f32(a, b));
				170	},
				171	input1, input2, output);
				172	}
				173
Georgios Pinitas	a84faff	2018-12-05 18:17:24 +0000	[diff] [blame]	174	void add_QASYMM8_QASYMM8_QASYMM8(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				175	{
				176	Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
				177	Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
				178	Iterator output(out, window);
				179
				180	const float32x4_t vscale1 = vdupq_n_f32(in1->info()->quantization_info().scale);
				181	const float32x4_t vscale2 = vdupq_n_f32(in2->info()->quantization_info().scale);
				182	const float32x4_t invvscaleo = vdupq_n_f32(1.f / out->info()->quantization_info().scale);
				183	const int32x4_t voffset1 = vdupq_n_s32(in1->info()->quantization_info().offset);
				184	const int32x4_t voffset2 = vdupq_n_s32(in2->info()->quantization_info().offset);
				185	const float32x4_t voffseto = vdupq_n_f32(out->info()->quantization_info().offset);
				186
				187	execute_window_loop(window, [&](const Coordinates & id)
				188	{
				189	const uint8x16_t a = vld1q_u8(input1.ptr());
				190	const uint8x16_t b = vld1q_u8(input2.ptr());
				191
				192	const float32x4x4_t af =
				193	{
				194	{
				195	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
				196	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1),
				197	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
				198	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1),
				199	}
				200	};
				201
				202	const float32x4x4_t bf =
				203	{
				204	{
				205	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2),
				206	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2),
				207	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2),
				208	vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2),
				209	}
				210	};
				211
				212	const int32x4x4_t rf =
				213	{
				214	{
				215	vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[0], bf.val[0]), invvscaleo)),
				216	vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[1], bf.val[1]), invvscaleo)),
				217	vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[2], bf.val[2]), invvscaleo)),
				218	vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af.val[3], bf.val[3]), invvscaleo)),
				219	}
				220	};
				221
				222	const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
				223	const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
				224	vst1q_u8(output.ptr(), vcombine_u8(pa, pb));
				225	},
				226	input1, input2, output);
				227	}
				228
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	229	void add_wrap_S16_S16_S16(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				230	{
Diego Lopez Recas	0021d75	2017-12-18 14:42:56 +0000	[diff] [blame]	231	Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
				232	Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	233	Iterator output(out, window);
				234
				235	execute_window_loop(window, [&](const Coordinates & id)
				236	{
				237	const int16x8x2_t a = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
				238	const int16x8x2_t b = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
				239
				240	vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), vadd2q_s16(a, b));
				241	},
				242	input1, input2, output);
				243	}
				244
				245	void add_saturate_S16_S16_S16(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				246	{
Diego Lopez Recas	0021d75	2017-12-18 14:42:56 +0000	[diff] [blame]	247	Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
				248	Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	249	Iterator output(out, window);
				250
				251	execute_window_loop(window, [&](const Coordinates & id)
				252	{
				253	const int16x8x2_t a = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
				254	const int16x8x2_t b = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
				255
				256	vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqadd2q_s16(a, b));
				257	},
				258	input1, input2, output);
				259	}
				260
				261	void add_wrap_S16_U8_S16(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				262	{
Diego Lopez Recas	0021d75	2017-12-18 14:42:56 +0000	[diff] [blame]	263	Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
				264	Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	265	Iterator output(out, window);
				266
				267	execute_window_loop(window, [&](const Coordinates & id)
				268	{
				269	const int16x8x2_t a =
				270	{
				271	{
				272	vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr())),
				273	vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8)
				274	}
				275	};
				276	const uint8x16_t b = vld1q_u8(input2.ptr());
				277
				278	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vaddq_s16(a.val[0], vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b)))));
				279	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vaddq_s16(a.val[1], vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))));
				280	},
				281	input1, input2, output);
				282	}
				283
				284	void add_saturate_S16_U8_S16(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				285	{
Diego Lopez Recas	0021d75	2017-12-18 14:42:56 +0000	[diff] [blame]	286	Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
				287	Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	288	Iterator output(out, window);
				289
				290	execute_window_loop(window, [&](const Coordinates & id)
				291	{
				292	const int16x8x2_t a =
				293	{
				294	{
				295	vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr())),
				296	vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8)
				297	}
				298	};
				299	const uint8x16_t b = vld1q_u8(input2.ptr());
				300
				301	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqaddq_s16(a.val[0], vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b)))));
				302	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vqaddq_s16(a.val[1], vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))));
				303	},
				304	input1, input2, output);
				305	}
				306
				307	inline void add_wrap_U8_S16_S16(const ITensor input1, const ITensor input2, ITensor *output, const Window &window)
				308	{
				309	//Simply swap the two input buffers:
				310	add_wrap_S16_U8_S16(input2, input1, output, window);
				311	}
				312
				313	inline void add_saturate_U8_S16_S16(const ITensor input1, const ITensor input2, ITensor *output, const Window &window)
				314	{
				315	//Simply swap the two input buffers:
				316	add_saturate_S16_U8_S16(input2, input1, output, window);
				317	}
				318
				319	void add_wrap_U8_U8_S16(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				320	{
Diego Lopez Recas	0021d75	2017-12-18 14:42:56 +0000	[diff] [blame]	321	Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
				322	Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	323	Iterator output(out, window);
				324
				325	execute_window_loop(window, [&](const Coordinates & id)
				326	{
				327	const uint8x16_t a = vld1q_u8(input1.ptr());
				328	const uint8x16_t b = vld1q_u8(input2.ptr());
				329
				330	const int16x8x2_t a_s16 =
				331	{
				332	{
				333	vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
				334	vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a)))
				335	}
				336	};
				337
				338	const int16x8x2_t b_s16 =
				339	{
				340	{
				341	vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))),
				342	vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))
				343	}
				344	};
				345
				346	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vaddq_s16(a_s16.val[0], b_s16.val[0]));
				347	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vaddq_s16(a_s16.val[1], b_s16.val[1]));
				348	},
				349	input1, input2, output);
				350	}
				351
				352	void add_saturate_U8_U8_S16(const ITensor in1, const ITensor in2, ITensor *out, const Window &window)
				353	{
Diego Lopez Recas	0021d75	2017-12-18 14:42:56 +0000	[diff] [blame]	354	Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
				355	Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	356	Iterator output(out, window);
				357
				358	execute_window_loop(window, [&](const Coordinates & id)
				359	{
				360	const uint8x16_t a = vld1q_u8(input1.ptr());
				361	const uint8x16_t b = vld1q_u8(input2.ptr());
				362
				363	const int16x8x2_t a_s16 =
				364	{
				365	{
				366	vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
				367	vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a)))
				368	}
				369	};
				370
				371	const int16x8x2_t b_s16 =
				372	{
				373	{
				374	vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))),
				375	vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)))
				376	}
				377	};
				378
				379	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqaddq_s16(a_s16.val[0], b_s16.val[0]));
				380	vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, vqaddq_s16(a_s16.val[1], b_s16.val[1]));
				381	},
				382	input1, input2, output);
				383	}
Ioan-Cristian Szabo	397d58a	2017-11-30 15:19:11 +0000	[diff] [blame]	384
Diego Lopez Recas	0021d75	2017-12-18 14:42:56 +0000	[diff] [blame]	385	Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ConvertPolicy policy)
Ioan-Cristian Szabo	397d58a	2017-11-30 15:19:11 +0000	[diff] [blame]	386	{
				387	ARM_COMPUTE_UNUSED(policy);
Ioan-Cristian Szabo	397d58a	2017-11-30 15:19:11 +0000	[diff] [blame]	388
Anthony Barbier	eaefd00	2018-07-20 17:49:35 +0100	[diff] [blame]	389	ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input1);
Georgios Pinitas	a84faff	2018-12-05 18:17:24 +0000	[diff] [blame]	390	ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
				391	ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
Diego Lopez Recas	0021d75	2017-12-18 14:42:56 +0000	[diff] [blame]	392
				393	const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
				394
				395	ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
				396
Diego Lopez Recas	0021d75	2017-12-18 14:42:56 +0000	[diff] [blame]	397	// Validate in case of configured output
				398	if(output.total_size() > 0)
				399	{
				400	ARM_COMPUTE_RETURN_ERROR_ON_MSG(
Vidhya Sudhan Loganathan	7485d5a	2018-07-04 09:34:00 +0100	[diff] [blame]	401	!(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::U8)
Diego Lopez Recas	0021d75	2017-12-18 14:42:56 +0000	[diff] [blame]	402	&& !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
				403	&& !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
				404	&& !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
Diego Lopez Recas	0021d75	2017-12-18 14:42:56 +0000	[diff] [blame]	405	&& !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
				406	&& !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32 && output.data_type() == DataType::F32)
Georgios Pinitas	a84faff	2018-12-05 18:17:24 +0000	[diff] [blame]	407	&& !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16 && output.data_type() == DataType::F16)
				408	&& !(input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8 && output.data_type() == DataType::QASYMM8),
Diego Lopez Recas	0021d75	2017-12-18 14:42:56 +0000	[diff] [blame]	409	"You called addition with the wrong image formats");
				410
				411	ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
				412	"Wrong shape for output");
Diego Lopez Recas	0021d75	2017-12-18 14:42:56 +0000	[diff] [blame]	413	}
Ioan-Cristian Szabo	397d58a	2017-11-30 15:19:11 +0000	[diff] [blame]	414
Georgios Pinitas	631c41a	2017-12-06 11:53:03 +0000	[diff] [blame]	415	return Status{};
Ioan-Cristian Szabo	397d58a	2017-11-30 15:19:11 +0000	[diff] [blame]	416	}
				417
Diego Lopez Recas	0021d75	2017-12-18 14:42:56 +0000	[diff] [blame]	418	std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
Ioan-Cristian Szabo	397d58a	2017-11-30 15:19:11 +0000	[diff] [blame]	419	{
Diego Lopez Recas	0021d75	2017-12-18 14:42:56 +0000	[diff] [blame]	420	const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
				421	const TensorShape &out_shape = broadcast_pair.first;
				422	const ValidRegion &valid_region = broadcast_pair.second;
Ioan-Cristian Szabo	397d58a	2017-11-30 15:19:11 +0000	[diff] [blame]	423
Diego Lopez Recas	0021d75	2017-12-18 14:42:56 +0000	[diff] [blame]	424	// Auto initialize output if not initialized
				425	{
				426	set_shape_if_empty(output, out_shape);
Ioan-Cristian Szabo	397d58a	2017-11-30 15:19:11 +0000	[diff] [blame]	427
Diego Lopez Recas	0021d75	2017-12-18 14:42:56 +0000	[diff] [blame]	428	if(input1.data_type() == DataType::S16 \|\| input2.data_type() == DataType::S16)
				429	{
				430	set_format_if_unknown(output, Format::S16);
				431	}
				432	else if(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16)
				433	{
				434	set_format_if_unknown(output, Format::F16);
				435	}
				436	else if(input1.data_type() == DataType::F32 \|\| input2.data_type() == DataType::F32)
				437	{
				438	set_format_if_unknown(output, Format::F32);
				439	}
Georgios Pinitas	a84faff	2018-12-05 18:17:24 +0000	[diff] [blame]	440	else if(input1.data_type() == DataType::QASYMM8 \|\| input2.data_type() == DataType::QASYMM8)
				441	{
				442	set_data_type_if_unknown(output, DataType::QASYMM8);
				443	}
Diego Lopez Recas	0021d75	2017-12-18 14:42:56 +0000	[diff] [blame]	444	}
Ioan-Cristian Szabo	397d58a	2017-11-30 15:19:11 +0000	[diff] [blame]	445
Diego Lopez Recas	0021d75	2017-12-18 14:42:56 +0000	[diff] [blame]	446	Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
				447	Window win_input1 = win.broadcast_if_dimension_le_one(input1);
				448	Window win_input2 = win.broadcast_if_dimension_le_one(input2);
				449
				450	AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration);
				451	AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration);
				452	AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
				453
				454	bool window_changed = update_window_and_padding(win_input1, input1_access)
				455	\|\| update_window_and_padding(win_input2, input2_access)
				456	\|\| update_window_and_padding(win, output_access);
Ioan-Cristian Szabo	397d58a	2017-11-30 15:19:11 +0000	[diff] [blame]	457
				458	output_access.set_valid_region(win, valid_region);
				459
Georgios Pinitas	631c41a	2017-12-06 11:53:03 +0000	[diff] [blame]	460	Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
Ioan-Cristian Szabo	397d58a	2017-11-30 15:19:11 +0000	[diff] [blame]	461	return std::make_pair(err, win);
				462	}
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	463	} // namespace
				464
				465	NEArithmeticAdditionKernel::NEArithmeticAdditionKernel()
				466	: _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr)
				467	{
				468	}
				469
				470	void NEArithmeticAdditionKernel::configure(const ITensor input1, const ITensor input2, ITensor *output, ConvertPolicy policy)
				471	{
				472	ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
Diego Lopez Recas	0021d75	2017-12-18 14:42:56 +0000	[diff] [blame]	473	ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), *output->info(), policy));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	474
Diego Lopez Recas	0021d75	2017-12-18 14:42:56 +0000	[diff] [blame]	475	// Configure kernel window
				476	auto win_config = validate_and_configure_window(input1->info(), input2->info(), *output->info());
				477	ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	478
				479	static std::map<std::string, AddFunction *> map_function =
				480	{
				481	{ "add_wrap_U8_U8_U8", &add_wrap_U8_U8_U8 },
				482	{ "add_saturate_U8_U8_U8", &add_saturate_U8_U8_U8 },
				483	{ "add_wrap_S16_U8_S16", &add_wrap_S16_U8_S16 },
				484	{ "add_saturate_S16_U8_S16", &add_saturate_S16_U8_S16 },
				485	{ "add_wrap_U8_S16_S16", &add_wrap_U8_S16_S16 },
				486	{ "add_saturate_U8_S16_S16", &add_saturate_U8_S16_S16 },
				487	{ "add_wrap_U8_U8_S16", &add_wrap_U8_U8_S16 },
				488	{ "add_saturate_U8_U8_S16", &add_saturate_U8_U8_S16 },
				489	{ "add_wrap_S16_S16_S16", &add_wrap_S16_S16_S16 },
				490	{ "add_saturate_S16_S16_S16", &add_saturate_S16_S16_S16 },
				491	{ "add_wrap_F32_F32_F32", &add_F32_F32_F32 },
				492	{ "add_saturate_F32_F32_F32", &add_F32_F32_F32 },
Pablo Tello	d1b0ecc	2017-07-11 11:27:04 +0100	[diff] [blame]	493	{ "add_wrap_F16_F16_F16", &add_F16_F16_F16 },
				494	{ "add_saturate_F16_F16_F16", &add_F16_F16_F16 },
Georgios Pinitas	a84faff	2018-12-05 18:17:24 +0000	[diff] [blame]	495	{ "add_wrap_QASYMM8_QASYMM8_QASYMM8", &add_QASYMM8_QASYMM8_QASYMM8 },
				496	{ "add_saturate_QASYMM8_QASYMM8_QASYMM8", &add_QASYMM8_QASYMM8_QASYMM8 },
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	497	};
				498
				499	_input1 = input1;
				500	_input2 = input2;
				501	_output = output;
				502
				503	std::string function_to_call("add_");
				504	function_to_call += policy == ConvertPolicy::WRAP ? "wrap_" : "saturate_";
				505	function_to_call += string_from_data_type(input1->info()->data_type()) + "_";
				506	function_to_call += string_from_data_type(input2->info()->data_type()) + "_";
				507	function_to_call += string_from_data_type(output->info()->data_type());
				508
				509	auto it = map_function.find(function_to_call);
				510
				511	if(it != map_function.end())
				512	{
				513	_func = it->second;
				514	}
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	515
Ioan-Cristian Szabo	397d58a	2017-11-30 15:19:11 +0000	[diff] [blame]	516	INEKernel::configure(win_config.second);
				517	}
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	518
Georgios Pinitas	631c41a	2017-12-06 11:53:03 +0000	[diff] [blame]	519	Status NEArithmeticAdditionKernel::validate(const ITensorInfo input1, const ITensorInfo input2, const ITensorInfo *output, ConvertPolicy policy)
Ioan-Cristian Szabo	397d58a	2017-11-30 15:19:11 +0000	[diff] [blame]	520	{
Georgios Pinitas	cbf39c6	2018-09-10 15:07:45 +0100	[diff] [blame]	521	ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
Diego Lopez Recas	0021d75	2017-12-18 14:42:56 +0000	[diff] [blame]	522
				523	ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, *output, policy));
				524	ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone(), input2->clone(), *output->clone()).first);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	525
Georgios Pinitas	631c41a	2017-12-06 11:53:03 +0000	[diff] [blame]	526	return Status{};
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	527	}
				528
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	529	void NEArithmeticAdditionKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	530	{
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	531	ARM_COMPUTE_UNUSED(info);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	532	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				533	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
				534	ARM_COMPUTE_ERROR_ON(_func == nullptr);
				535
				536	(*_func)(_input1, _input2, _output, window);
				537	}
Diego Lopez Recas	0021d75	2017-12-18 14:42:56 +0000	[diff] [blame]	538
				539	BorderSize NEArithmeticAdditionKernel::border_size() const
				540	{
				541	const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
				542	const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
				543	return BorderSize(0, border, 0, 0);
				544	}