Blame - src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp - ml/ComputeLibrary

blob: c271032e54befe4ca5e0dd9e124f634709398502 [file] [log] [blame]

Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2016, 2017 ARM Limited.
				3	*
				4	* SPDX-License-Identifier: MIT
				5	*
				6	* Permission is hereby granted, free of charge, to any person obtaining a copy
				7	* of this software and associated documentation files (the "Software"), to
				8	* deal in the Software without restriction, including without limitation the
				9	* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
				10	* sell copies of the Software, and to permit persons to whom the Software is
				11	* furnished to do so, subject to the following conditions:
				12	*
				13	* The above copyright notice and this permission notice shall be included in all
				14	* copies or substantial portions of the Software.
				15	*
				16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				19	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				21	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
				22	* SOFTWARE.
				23	*/
				24	#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
				25
				26	#include "arm_compute/core/Error.h"
				27	#include "arm_compute/core/Helpers.h"
				28	#include "arm_compute/core/IAccessWindow.h"
				29	#include "arm_compute/core/ITensor.h"
				30	#include "arm_compute/core/NEON/NEFixedPoint.h"
				31	#include "arm_compute/core/TensorInfo.h"
				32	#include "arm_compute/core/Validate.h"
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	33
				34	#include <arm_neon.h>
				35	#include <climits>
				36	#include <cmath>
				37	#include <cstdint>
				38	#include <cstdlib>
				39
Ioan-Cristian Szabo	5edbd1c	2017-11-13 13:34:08 +0000	[diff] [blame]	40	#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello	df24618	2017-07-03 16:25:09 +0100	[diff] [blame]	41	#include <arm_fp16.h> // needed for float16_t
Ioan-Cristian Szabo	5edbd1c	2017-11-13 13:34:08 +0000	[diff] [blame]	42	#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello	df24618	2017-07-03 16:25:09 +0100	[diff] [blame]	43
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	44	using namespace arm_compute;
				45
				46	namespace arm_compute
				47	{
				48	class Coordinates;
				49	} // namespace arm_compute
				50
				51	namespace
				52	{
				53	const float scale255_constant = 1.f / 255.f;
				54	const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
				55	const float32x4_t positive_round_f32q = vdupq_n_f32(0.5f);
				56
Georgios Pinitas	631c41a	2017-12-06 11:53:03 +0000	[diff] [blame^]	57	inline Status validate_arguments(const ITensorInfo input1, const ITensorInfo input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
Ioan-Cristian Szabo	754e952	2017-11-28 18:29:43 +0000	[diff] [blame]	58	{
				59	ARM_COMPUTE_UNUSED(overflow_policy);
				60	ARM_COMPUTE_UNUSED(rounding_policy);
				61
				62	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
				63	ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
				64	ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
				65	ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
				66	ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 \|\| input2->data_type() != DataType::U8),
				67	"Output can only be U8 if both inputs are U8");
				68
				69	if(is_data_type_fixed_point(input1->data_type()) \|\| is_data_type_fixed_point(input2->data_type()) \|\| is_data_type_fixed_point(output->data_type()))
				70	{
				71	// Check that all data types are the same and all fixed-point positions are the same
				72	ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output);
				73	// Check if scale is representable in fixed-point with the provided settings
				74	ARM_COMPUTE_RETURN_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(scale, input1);
				75	}
				76
				77	if(std::abs(scale - scale255_constant) < 0.00001f)
				78	{
				79	ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
				80	}
				81	else
				82	{
				83	ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_ZERO);
				84
				85	int exponent = 0;
				86	const float normalized_mantissa = std::frexp(scale, &exponent);
				87
				88	// Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
				89	// frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= 14
				90	// Moreover, it will be negative as we deal with 1/2^n
				91	ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), "Scale value not supported (Should be 1/(2^n) or 1/255");
				92	}
				93
Georgios Pinitas	631c41a	2017-12-06 11:53:03 +0000	[diff] [blame^]	94	return Status{};
Ioan-Cristian Szabo	754e952	2017-11-28 18:29:43 +0000	[diff] [blame]	95	}
				96
Georgios Pinitas	631c41a	2017-12-06 11:53:03 +0000	[diff] [blame^]	97	inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo input1, ITensorInfo input2, ITensorInfo *output)
Ioan-Cristian Szabo	754e952	2017-11-28 18:29:43 +0000	[diff] [blame]	98	{
				99	constexpr unsigned int num_elems_processed_per_iteration = 16;
				100
				101	// Configure kernel window
				102	Window win = calculate_max_window(*input1, Steps(num_elems_processed_per_iteration));
				103	AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
				104
				105	bool window_changed = update_window_and_padding(win,
				106	AccessWindowHorizontal(input1, 0, num_elems_processed_per_iteration),
				107	AccessWindowHorizontal(input2, 0, num_elems_processed_per_iteration),
				108	output_access);
				109
				110	ValidRegion valid_region = intersect_valid_regions(input1->valid_region(),
				111	input2->valid_region());
				112
				113	output_access.set_valid_region(win, valid_region);
				114
Georgios Pinitas	631c41a	2017-12-06 11:53:03 +0000	[diff] [blame^]	115	Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
Ioan-Cristian Szabo	754e952	2017-11-28 18:29:43 +0000	[diff] [blame]	116	return std::make_pair(err, win);
				117	}
				118
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	119	/* Scales a given vector by 1/255.
				120	*
				121	* @note This does not work for all cases. e.g. for float of 0.49999999999999994 and large floats.
				122	*
				123	* @param in Input vector to scale.
				124	* @return Scaled output rounded to nearest (round half up).
				125	*/
				126	inline int32x4_t scale255_S32_S32(int32x4_t in)
				127	{
				128	// Scale
				129	const float32x4_t tmp = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q);
				130	// Round to nearest (round half up)
				131	// Add +0.5 for all values
				132	// Afterwards vcvt rounds toward zero
				133	return vcvtq_s32_f32(vaddq_f32(tmp, positive_round_f32q));
				134	}
				135
				136	inline uint16x8_t scale255_U16_U16(uint16x8_t in)
				137	{
				138	const int32x4_t tmp_s1 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(in))));
				139	const int32x4_t tmp_s2 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(in))));
				140	return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(tmp_s2), vmovn_s32(tmp_s1)));
				141	}
				142
				143	template <bool is_scale255, bool is_sat>
				144	void mul_U8_U8_U8_n(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int n)
				145	{
				146	const auto input1 = static_cast<const uint8_t *__restrict>(input1_ptr);
				147	const auto input2 = static_cast<const uint8_t *__restrict>(input2_ptr);
				148	const auto output = static_cast<uint8_t *__restrict>(output_ptr);
				149
				150	const uint8x16_t ta1 = vld1q_u8(input1);
				151	const uint8x16_t ta2 = vld1q_u8(input2);
				152
				153	uint16x8_t tmp1_high = vmovl_u8(vget_high_u8(ta1));
				154	const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
				155	uint16x8_t tmp1_low = vmovl_u8(vget_low_u8(ta1));
				156	const uint16x8_t tmp2_low = vmovl_u8(vget_low_u8(ta2));
				157
				158	tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
				159	tmp1_low = vmulq_u16(tmp1_low, tmp2_low);
				160
				161	if(is_scale255)
				162	{
				163	tmp1_high = scale255_U16_U16(tmp1_high);
				164	tmp1_low = scale255_U16_U16(tmp1_low);
				165	}
				166	else
				167	{
				168	const int16x8_t vn = vdupq_n_s16(-n);
				169
				170	if(is_sat)
				171	{
				172	tmp1_high = vqshlq_u16(tmp1_high, vn);
				173	tmp1_low = vqshlq_u16(tmp1_low, vn);
				174	}
				175	else
				176	{
				177	tmp1_high = vshlq_u16(tmp1_high, vn);
				178	tmp1_low = vshlq_u16(tmp1_low, vn);
				179	}
				180	}
				181
				182	if(is_sat)
				183	{
				184	vst1q_u8(output, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
				185	}
				186	else
				187	{
				188	vst1q_u8(output, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
				189	}
				190	}
				191
				192	template <bool is_scale255, bool is_sat>
				193	void mul_QS8_QS8_QS8_n(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int n, int fixed_point_position)
				194	{
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	195	const auto output = static_cast<qint8_t *__restrict>(output_ptr);
				196
Michele Di Giorgio	1b80b6c	2017-07-17 15:06:34 +0100	[diff] [blame]	197	const qint8x16_t ta1 = vld1q_qs8(static_cast<const qint8_t *__restrict>(input1_ptr));
				198	const qint8x16_t ta2 = vld1q_qs8(static_cast<const qint8_t *__restrict>(input2_ptr));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	199
Michele Di Giorgio	1b80b6c	2017-07-17 15:06:34 +0100	[diff] [blame]	200	if(is_scale255)
				201	{
				202	qint16x8_t tmp1_high = vmovl_s8(vget_high_s8(ta1));
				203	qint16x8_t tmp1_low = vmovl_s8(vget_low_s8(ta1));
				204	const qint16x8_t tmp2_high = vmovl_s8(vget_high_s8(ta2));
				205	const qint16x8_t tmp2_low = vmovl_s8(vget_low_s8(ta2));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	206
Michele Di Giorgio	1b80b6c	2017-07-17 15:06:34 +0100	[diff] [blame]	207	const float32x4x2_t scale255_f32 =
				208	{
				209	{
				210	scale255_constant_f32q,
				211	scale255_constant_f32q
				212	}
				213	};
				214	const qint16x8_t scale255 = vqcvtq_qs16_f32(scale255_f32, fixed_point_position);
				215
				216	tmp1_high = vmulq_qs16(tmp1_high, tmp2_high, fixed_point_position);
				217	tmp1_low = vmulq_qs16(tmp1_low, tmp2_low, fixed_point_position);
				218	tmp1_high = vmulq_qs16(tmp1_high, scale255, fixed_point_position);
				219	tmp1_low = vmulq_qs16(tmp1_low, scale255, fixed_point_position);
				220
				221	if(is_sat)
				222	{
				223	vst1q_qs8(output, vcombine_s8(vqmovn_s16(tmp1_low), vqmovn_s16(tmp1_high)));
				224	}
				225	else
				226	{
				227	vst1q_qs8(output, vcombine_s8(vmovn_s16(tmp1_low), vmovn_s16(tmp1_high)));
				228	}
				229	}
				230	else
				231	{
				232	const qint8x16_t vn = vdupq_n_s8(-n);
				233	qint8x16_t res = ta2;
				234
				235	if(is_sat)
				236	{
				237	res = vqshlq_s8(vqmulq_qs8(ta1, res, fixed_point_position), vn);
				238	}
				239	else
				240	{
				241	res = vshlq_s8(vmulq_qs8(ta1, res, fixed_point_position), vn);
				242	}
				243	vst1q_qs8(output, res);
				244	}
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	245	}
				246
				247	template <bool is_scale255, bool is_sat>
Michele Di Giorgio	81f0d15	2017-07-11 15:00:52 +0100	[diff] [blame]	248	void mul_QS16_QS16_QS16_n(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int n, int fixed_point_position)
				249	{
Michele Di Giorgio	81f0d15	2017-07-11 15:00:52 +0100	[diff] [blame]	250	const qint16x8x2_t ta1 = vld2q_qs16(static_cast<const qint16_t *__restrict>(input1_ptr));
Michele Di Giorgio	1b80b6c	2017-07-17 15:06:34 +0100	[diff] [blame]	251	qint16x8x2_t res = vld2q_qs16(static_cast<const qint16_t *__restrict>(input2_ptr));
Michele Di Giorgio	81f0d15	2017-07-11 15:00:52 +0100	[diff] [blame]	252
Michele Di Giorgio	1b80b6c	2017-07-17 15:06:34 +0100	[diff] [blame]	253	if(is_scale255)
Michele Di Giorgio	81f0d15	2017-07-11 15:00:52 +0100	[diff] [blame]	254	{
Michele Di Giorgio	1b80b6c	2017-07-17 15:06:34 +0100	[diff] [blame]	255	const float32x4x2_t scale255_f32 =
Michele Di Giorgio	81f0d15	2017-07-11 15:00:52 +0100	[diff] [blame]	256	{
				257	{
Michele Di Giorgio	1b80b6c	2017-07-17 15:06:34 +0100	[diff] [blame]	258	scale255_constant_f32q,
				259	scale255_constant_f32q
Michele Di Giorgio	81f0d15	2017-07-11 15:00:52 +0100	[diff] [blame]	260	}
				261	};
Michele Di Giorgio	1b80b6c	2017-07-17 15:06:34 +0100	[diff] [blame]	262	const qint16x8_t scale255 = vqcvtq_qs16_f32(scale255_f32, fixed_point_position);
				263	if(is_sat)
				264	{
				265	res.val[0] = vqmulq_qs16(vqmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), scale255, fixed_point_position);
				266	res.val[1] = vqmulq_qs16(vqmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), scale255, fixed_point_position);
				267	}
				268	else
				269	{
				270	res.val[0] = vmulq_qs16(vmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), scale255, fixed_point_position);
				271	res.val[1] = vmulq_qs16(vmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), scale255, fixed_point_position);
				272	}
Michele Di Giorgio	81f0d15	2017-07-11 15:00:52 +0100	[diff] [blame]	273	}
				274	else
				275	{
Michele Di Giorgio	1b80b6c	2017-07-17 15:06:34 +0100	[diff] [blame]	276	const qint16x8_t vn = vdupq_n_s16(-n);
				277	if(is_sat)
Michele Di Giorgio	81f0d15	2017-07-11 15:00:52 +0100	[diff] [blame]	278	{
Michele Di Giorgio	1b80b6c	2017-07-17 15:06:34 +0100	[diff] [blame]	279	res.val[0] = vqshlq_s16(vqmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), vn);
				280	res.val[1] = vqshlq_s16(vqmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), vn);
				281	}
				282	else
				283	{
				284	res.val[0] = vshlq_s16(vmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), vn);
				285	res.val[1] = vshlq_s16(vmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), vn);
				286	}
Michele Di Giorgio	81f0d15	2017-07-11 15:00:52 +0100	[diff] [blame]	287	}
Michele Di Giorgio	1b80b6c	2017-07-17 15:06:34 +0100	[diff] [blame]	288	vst2q_s16(static_cast<qint16_t *__restrict>(output_ptr), res);
Michele Di Giorgio	81f0d15	2017-07-11 15:00:52 +0100	[diff] [blame]	289	}
				290
				291	template <bool is_scale255, bool is_sat>
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	292	inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &input1, const int16x8_t &input2, int n)
				293	{
				294	int32x4_t tmp1_high = vmovl_s16(vget_high_s16(input1));
				295	const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(input2));
				296	int32x4_t tmp1_low = vmovl_s16(vget_low_s16(input1));
				297	const int32x4_t tmp2_low = vmovl_s16(vget_low_s16(input2));
				298
				299	tmp1_high = vmulq_s32(tmp1_high, tmp2_high);
				300	tmp1_low = vmulq_s32(tmp1_low, tmp2_low);
				301
				302	if(is_scale255)
				303	{
				304	tmp1_high = scale255_S32_S32(tmp1_high);
				305	tmp1_low = scale255_S32_S32(tmp1_low);
				306	}
				307	else
				308	{
				309	// Right shift amount
				310	const int32x4_t vn = vdupq_n_s32(-n);
				311	// Left shift amount
				312	const int32x4_t vnl = vdupq_n_s32(n);
				313	// Calculate conversion bit
				314	const uint32x4_t tmp1_high_u = vreinterpretq_u32_s32(tmp1_high);
				315	const uint32x4_t tmp1_low_u = vreinterpretq_u32_s32(tmp1_low);
				316	const uint32x4_t sign_high = vshrq_n_u32(tmp1_high_u, 31);
				317	const uint32x4_t sign_low = vshrq_n_u32(tmp1_low_u, 31);
				318	const int32x4_t sign_high_s = vreinterpretq_s32_u32(sign_high);
				319	const int32x4_t sign_low_s = vreinterpretq_s32_u32(sign_low);
				320	const int32x4_t convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s);
				321	const int32x4_t convert_low = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s);
				322	if(is_sat)
				323	{
				324	tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
				325	tmp1_low = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
				326	}
				327	else
				328	{
				329	tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
				330	tmp1_low = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
				331	}
				332	}
				333
				334	if(is_sat)
				335	{
				336	return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high));
				337	}
				338	else
				339	{
				340	return vcombine_s16(vmovn_s32(tmp1_low), vmovn_s32(tmp1_high));
				341	}
				342	}
				343
				344	template <bool is_scale255, bool is_sat>
				345	inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &input1, const int16x8x2_t &input2, int n)
				346	{
				347	const int16x8x2_t result =
				348	{
				349	{
				350	// First 8 elements
				351	mul_S16_S16_S16_n_loop<is_scale255, is_sat>(input1.val[0], input2.val[0], n),
				352	// Second 8 elements
				353	mul_S16_S16_S16_n_loop<is_scale255, is_sat>(input1.val[1], input2.val[1], n)
				354	}
				355	};
				356
				357	return result;
				358	}
				359
				360	template <bool is_scale255, bool is_sat>
				361	void mul_S16_S16_S16_n(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int n)
				362	{
				363	const auto input1 = static_cast<const int16_t *__restrict>(input1_ptr);
				364	const auto input2 = static_cast<const int16_t *__restrict>(input2_ptr);
				365	const auto output = static_cast<int16_t *__restrict>(output_ptr);
				366
				367	const int16x8x2_t ta1 = vld2q_s16(input1);
				368	const int16x8x2_t ta2 = vld2q_s16(input2);
				369	const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
				370
				371	vst2q_s16(output, result);
				372	}
				373
				374	template <bool is_scale255, bool is_sat>
				375	void mul_F32_F32_F32_n(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, float scale)
				376	{
				377	const auto input1 = static_cast<const float *__restrict>(input1_ptr);
				378	const auto input2 = static_cast<const float *__restrict>(input2_ptr);
				379	const auto output = static_cast<float *__restrict>(output_ptr);
				380
				381	const float32x4x4_t ta1 = vld4q_f32(input1);
				382	const float32x4x4_t ta2 = vld4q_f32(input2);
				383	const float32x4_t scale_vec = vdupq_n_f32(scale);
				384	const float32x4x4_t result =
				385	{
				386	{
				387	vmulq_f32(vmulq_f32(ta1.val[0], ta2.val[0]), scale_vec),
				388	vmulq_f32(vmulq_f32(ta1.val[1], ta2.val[1]), scale_vec),
				389	vmulq_f32(vmulq_f32(ta1.val[2], ta2.val[2]), scale_vec),
				390	vmulq_f32(vmulq_f32(ta1.val[3], ta2.val[3]), scale_vec)
				391	}
				392	};
				393	vst4q_f32(output, result);
				394	}
				395
				396	template <bool is_scale255, bool is_sat>
Pablo Tello	df24618	2017-07-03 16:25:09 +0100	[diff] [blame]	397	void mul_F16_F16_F16_n(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, float scale)
				398	{
Ioan-Cristian Szabo	5edbd1c	2017-11-13 13:34:08 +0000	[diff] [blame]	399	#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello	df24618	2017-07-03 16:25:09 +0100	[diff] [blame]	400	const auto input1 = static_cast<const float16_t *__restrict>(input1_ptr);
				401	const auto input2 = static_cast<const float16_t *__restrict>(input2_ptr);
				402	const auto output = static_cast<float16_t *__restrict>(output_ptr);
				403	const float16x8x2_t ta1 = vld2q_f16(input1);
				404	const float16x8x2_t ta2 = vld2q_f16(input2);
				405	const float16x8_t scale_vec = vdupq_n_f16(scale);
				406	const float16x8x2_t result =
				407	{
				408	{
				409	vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec),
				410	vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec),
				411	}
				412	};
				413	vst2q_f16(output, result);
Ioan-Cristian Szabo	5edbd1c	2017-11-13 13:34:08 +0000	[diff] [blame]	414	#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Georgios Pinitas	30f0215	2017-09-27 11:20:48 +0100	[diff] [blame]	415	ARM_COMPUTE_UNUSED(input1_ptr);
				416	ARM_COMPUTE_UNUSED(input2_ptr);
				417	ARM_COMPUTE_UNUSED(output_ptr);
				418	ARM_COMPUTE_UNUSED(scale);
Pablo Tello	df24618	2017-07-03 16:25:09 +0100	[diff] [blame]	419	ARM_COMPUTE_ERROR("Not supported. Recompile the library with arch=arm64-v8.2-a.");
Ioan-Cristian Szabo	5edbd1c	2017-11-13 13:34:08 +0000	[diff] [blame]	420	#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello	df24618	2017-07-03 16:25:09 +0100	[diff] [blame]	421	}
				422
				423	template <bool is_scale255, bool is_sat>
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	424	void mul_U8_U8_S16_n(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int n)
				425	{
				426	const auto input1 = static_cast<const uint8_t *__restrict>(input1_ptr);
				427	const auto input2 = static_cast<const uint8_t *__restrict>(input2_ptr);
				428	const auto output = static_cast<int16_t *__restrict>(output_ptr);
				429
				430	const uint8x16_t bv = vld1q_u8(input2);
				431	const uint8x16_t av = vld1q_u8(input1);
				432
				433	uint16x8_t tmp_low = vmovl_u8(vget_low_u8(av));
				434	uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
				435	tmp_low = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
				436	tmp_high = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));
				437
				438	if(is_scale255)
				439	{
				440	tmp_low = scale255_U16_U16(tmp_low);
				441	tmp_high = scale255_U16_U16(tmp_high);
				442	}
				443	else
				444	{
				445	const int16x8_t vn = vdupq_n_s16(-n);
				446
				447	if(is_sat)
				448	{
				449	tmp_low = vqshlq_u16(tmp_low, vn);
				450	tmp_high = vqshlq_u16(tmp_high, vn);
				451	}
				452	else
				453	{
				454	tmp_low = vshlq_u16(tmp_low, vn);
				455	tmp_high = vshlq_u16(tmp_high, vn);
				456	}
				457	}
				458
				459	if(is_sat)
				460	{
				461	static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);
				462
				463	tmp_low = vminq_u16(tmp_low, max);
				464	tmp_high = vminq_u16(tmp_high, max);
				465	}
				466
				467	vst1q_s16(output, vreinterpretq_s16_u16(tmp_low));
				468	vst1q_s16(output + 8, vreinterpretq_s16_u16(tmp_high));
				469	}
				470
				471	template <bool is_scale255, bool is_sat>
				472	void mul_S16_U8_S16_n(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int n)
				473	{
				474	const auto input1 = static_cast<const int16_t *__restrict>(input1_ptr);
				475	const auto input2 = static_cast<const uint8_t *__restrict>(input2_ptr);
				476	const auto output = static_cast<int16_t *__restrict>(output_ptr);
				477
				478	const int16x8x2_t ta1 = vld2q_s16(input1);
				479	const uint8x8x2_t ta2u = vld2_u8(input2);
				480	const int16x8x2_t ta2 =
				481	{
				482	{
				483	vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])),
				484	vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))
				485	}
				486	};
				487
				488	const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
				489
				490	vst2q_s16(output, result);
				491	}
				492
				493	template <bool is_scale255, bool is_sat>
				494	void mul_U8_S16_S16_n(const void __restrict input1_ptr, const void __restrict input2_ptr, void *__restrict output_ptr, int n)
				495	{
				496	// Simply swap the two input buffers
				497	mul_S16_U8_S16_n<is_scale255, is_sat>(input2_ptr, input1_ptr, output_ptr, n);
				498	}
				499	} // namespace
				500
				501	NEPixelWiseMultiplicationKernel::NEPixelWiseMultiplicationKernel()
				502	: _func_float(nullptr), _func_int(nullptr), _func_q_int(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _scale{ 0 }, _scale_exponent{ 0 }
				503	{
				504	}
				505
				506	void NEPixelWiseMultiplicationKernel::configure(const ITensor input1, const ITensor input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
				507	{
Ioan-Cristian Szabo	754e952	2017-11-28 18:29:43 +0000	[diff] [blame]	508	ARM_COMPUTE_UNUSED(rounding_policy);
Georgios Pinitas	f0dea70	2017-07-03 18:17:28 +0100	[diff] [blame]	509	ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
				510
				511	// Auto initialize output if not initialized
				512	{
				513	set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
				514
				515	if(input1->info()->data_type() == DataType::S16 \|\| input2->info()->data_type() == DataType::S16)
				516	{
				517	set_format_if_unknown(*output->info(), Format::S16);
				518	}
				519	else if(input1->info()->data_type() == DataType::F32 \|\| input2->info()->data_type() == DataType::F32)
				520	{
				521	set_format_if_unknown(*output->info(), Format::F32);
				522	}
Pablo Tello	df24618	2017-07-03 16:25:09 +0100	[diff] [blame]	523	else if(input1->info()->data_type() == DataType::F16 \|\| input2->info()->data_type() == DataType::F16)
				524	{
				525	set_format_if_unknown(*output->info(), Format::F16);
				526	}
Georgios Pinitas	f0dea70	2017-07-03 18:17:28 +0100	[diff] [blame]	527	else if(input1->info()->data_type() == DataType::QS8 && input2->info()->data_type() == DataType::QS8)
				528	{
				529	set_data_type_if_unknown(*output->info(), DataType::QS8);
				530	set_fixed_point_position_if_zero(*output->info(), input1->info()->fixed_point_position());
				531	}
				532	}
				533
Ioan-Cristian Szabo	754e952	2017-11-28 18:29:43 +0000	[diff] [blame]	534	ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy));
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	535
				536	_input1 = input1;
				537	_input2 = input2;
				538	_output = output;
				539	_scale = scale;
				540	_scale_exponent = 0;
				541	_func_int = nullptr;
				542	_func_q_int = nullptr;
				543	_func_float = nullptr;
				544
				545	bool is_scale_255 = false;
				546	// Check and validate scaling factor
				547	if(std::abs(scale - scale255_constant) < 0.00001f)
				548	{
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	549	is_scale_255 = true;
				550	}
				551	else
				552	{
Ioan-Cristian Szabo	754e952	2017-11-28 18:29:43 +0000	[diff] [blame]	553	int exponent = 0;
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	554
Ioan-Cristian Szabo	754e952	2017-11-28 18:29:43 +0000	[diff] [blame]	555	std::frexp(scale, &exponent);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	556
Ioan-Cristian Szabo	754e952	2017-11-28 18:29:43 +0000	[diff] [blame]	557	// Store the positive exponent. We know that we compute 1/2^n
				558	// Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5
				559	_scale_exponent = std::abs(exponent - 1);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	560	}
				561
				562	const DataType dt_input1 = input1->info()->data_type();
				563	const DataType dt_input2 = input2->info()->data_type();
				564	const DataType dt_output = output->info()->data_type();
				565	const bool is_sat = (overflow_policy == ConvertPolicy::SATURATE);
				566
				567	if(DataType::U8 == dt_input1 && DataType::U8 == dt_input2 && DataType::U8 == dt_output)
				568	{
				569	if(is_scale_255)
				570	{
				571	_func_int = is_sat ? &mul_U8_U8_U8_n<true, true> : &mul_U8_U8_U8_n<true, false>;
				572	}
				573	else
				574	{
				575	_func_int = is_sat ? &mul_U8_U8_U8_n<false, true> : &mul_U8_U8_U8_n<false, false>;
				576	}
				577	}
				578	else if(DataType::S16 == dt_input1 && DataType::S16 == dt_input2 && DataType::S16 == dt_output)
				579	{
				580	if(is_scale_255)
				581	{
				582	_func_int = is_sat ? &mul_S16_S16_S16_n<true, true> : &mul_S16_S16_S16_n<true, false>;
				583	}
				584	else
				585	{
				586	_func_int = is_sat ? &mul_S16_S16_S16_n<false, true> : &mul_S16_S16_S16_n<false, false>;
				587	}
				588	}
				589	else if(DataType::S16 == dt_input1 && DataType::U8 == dt_input2 && DataType::S16 == dt_output)
				590	{
				591	if(is_scale_255)
				592	{
				593	_func_int = is_sat ? &mul_S16_U8_S16_n<true, true> : &mul_S16_U8_S16_n<true, false>;
				594	}
				595	else
				596	{
				597	_func_int = is_sat ? &mul_S16_U8_S16_n<false, true> : &mul_S16_U8_S16_n<false, false>;
				598	}
				599	}
				600	else if(DataType::U8 == dt_input1 && DataType::S16 == dt_input2 && DataType::S16 == dt_output)
				601	{
				602	if(is_scale_255)
				603	{
				604	_func_int = is_sat ? &mul_U8_S16_S16_n<true, true> : &mul_U8_S16_S16_n<true, false>;
				605	}
				606	else
				607	{
				608	_func_int = is_sat ? &mul_U8_S16_S16_n<false, true> : &mul_U8_S16_S16_n<false, false>;
				609	}
				610	}
				611	else if(DataType::U8 == dt_input1 && DataType::U8 == dt_input2 && DataType::S16 == dt_output)
				612	{
				613	if(is_scale_255)
				614	{
				615	_func_int = is_sat ? &mul_U8_U8_S16_n<true, true> : &mul_U8_U8_S16_n<true, false>;
				616	}
				617	else
				618	{
				619	_func_int = is_sat ? &mul_U8_U8_S16_n<false, true> : &mul_U8_U8_S16_n<false, false>;
				620	}
				621	}
				622	else if(DataType::QS8 == dt_input1 && DataType::QS8 == dt_input2 && DataType::QS8 == dt_output)
				623	{
				624	if(is_scale_255)
				625	{
				626	_func_q_int = is_sat ? &mul_QS8_QS8_QS8_n<true, true> : &mul_QS8_QS8_QS8_n<true, false>;
				627	}
				628	else
				629	{
				630	_func_q_int = is_sat ? &mul_QS8_QS8_QS8_n<false, true> : &mul_QS8_QS8_QS8_n<false, false>;
				631	}
				632	}
Michele Di Giorgio	81f0d15	2017-07-11 15:00:52 +0100	[diff] [blame]	633	else if(DataType::QS16 == dt_input1 && DataType::QS16 == dt_input2 && DataType::QS16 == dt_output)
				634	{
				635	if(is_scale_255)
				636	{
				637	_func_q_int = is_sat ? &mul_QS16_QS16_QS16_n<true, true> : &mul_QS16_QS16_QS16_n<true, false>;
				638	}
				639	else
				640	{
				641	_func_q_int = is_sat ? &mul_QS16_QS16_QS16_n<false, true> : &mul_QS16_QS16_QS16_n<false, false>;
				642	}
				643	}
Pablo Tello	df24618	2017-07-03 16:25:09 +0100	[diff] [blame]	644	else if(DataType::F16 == dt_input1 && DataType::F16 == dt_input2 && DataType::F16 == dt_output)
				645	{
				646	_func_float = &mul_F16_F16_F16_n<false, false>;
				647	_func_int = nullptr;
				648	}
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	649	else if(DataType::F32 == dt_input1 && DataType::F32 == dt_input2 && DataType::F32 == dt_output)
				650	{
				651	_func_float = &mul_F32_F32_F32_n<false, false>;
				652	_func_int = nullptr;
				653	}
				654	else
				655	{
				656	ARM_COMPUTE_ERROR("You called with the wrong img formats");
				657	}
				658
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	659	// Configure kernel window
Ioan-Cristian Szabo	754e952	2017-11-28 18:29:43 +0000	[diff] [blame]	660	auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
				661	ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
				662	INEKernel::configure(win_config.second);
				663	}
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	664
Georgios Pinitas	631c41a	2017-12-06 11:53:03 +0000	[diff] [blame^]	665	Status NEPixelWiseMultiplicationKernel::validate(const ITensorInfo input1, const ITensorInfo input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy,
				666	RoundingPolicy rounding_policy)
Ioan-Cristian Szabo	754e952	2017-11-28 18:29:43 +0000	[diff] [blame]	667	{
				668	ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy));
				669	ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	670
Georgios Pinitas	631c41a	2017-12-06 11:53:03 +0000	[diff] [blame^]	671	return Status{};
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	672	}
				673
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	674	void NEPixelWiseMultiplicationKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	675	{
Moritz Pflanzer	c186b57	2017-09-07 09:48:04 +0100	[diff] [blame]	676	ARM_COMPUTE_UNUSED(info);
Anthony Barbier	6ff3b19	2017-09-04 18:44:23 +0100	[diff] [blame]	677	ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
				678	ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
				679
				680	Iterator input1(_input1, window);
				681	Iterator input2(_input2, window);
				682	Iterator output(_output, window);
				683
				684	if(_func_int != nullptr)
				685	{
				686	execute_window_loop(window, [&](const Coordinates & id)
				687	{
				688	(*_func_int)(input1.ptr(), input2.ptr(), output.ptr(), _scale_exponent);
				689	},
				690	input1, input2, output);
				691	}
				692	else if(_func_q_int != nullptr)
				693	{
				694	int fixed_point_position = _input1->info()->fixed_point_position();
				695	execute_window_loop(window, [&](const Coordinates & id)
				696	{
				697	(*_func_q_int)(input1.ptr(), input2.ptr(), output.ptr(), _scale_exponent, fixed_point_position);
				698	},
				699	input1, input2, output);
				700	}
				701	else
				702	{
				703	ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
				704	execute_window_loop(window, [&](const Coordinates & id)
				705	{
				706	(*_func_float)(input1.ptr(), input2.ptr(), output.ptr(), _scale);
				707	},
				708	input1, input2, output);
				709	}
				710	}