/*
 * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"

#include "arm_compute/core/TensorInfo.h"
#include "src/core/CPP/Validate.h"
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/NESymm.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"

#include <arm_neon.h>

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include <arm_fp16.h> // needed for float16_t
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */

namespace arm_compute
{
namespace
{
const float       scale255_constant      = 1.f / 255.f;
const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
const float32x4_t positive_round_f32q    = vdupq_n_f32(0.5f);

inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
    ARM_COMPUTE_UNUSED(overflow_policy);
    ARM_COMPUTE_UNUSED(rounding_policy);

    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input1);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
                                                         DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
                                                         DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
                                                         DataType::S16, DataType::QSYMM16,
                                                         DataType::S32, DataType::F16, DataType::F32);
    if(is_data_type_quantized(input1->data_type()) || is_data_type_quantized(input2->data_type()))
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, "ConvertPolicy cannot be WRAP if datatype is quantized");
    }

    if(output->total_size() > 0)
    {
        const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
        // clang-format off
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
            !(input1->data_type() == input2->data_type() && input2->data_type() == output->data_type()) &&
            !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::U8 && output->data_type() == DataType::S16) &&
            !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::S16 && output->data_type() == DataType::S16) &&
            !(input1->data_type() == DataType::S16 && input2->data_type() == DataType::U8 && output->data_type() == DataType::S16) &&
            !(input1->data_type() == DataType::QSYMM16 && input2->data_type() == DataType::QSYMM16 && output->data_type() == DataType::S32)
            , "Invalid data type combination");
        // clang-format on
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->data_type() == DataType::QSYMM16 && output->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 output");
    }

    if(std::abs(scale - scale255_constant) < 0.00001f)
    {
        ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->data_type() == DataType::S32 && input2->data_type() == DataType::S32 && output->data_type() == DataType::S32,
                                        "Scale == 1/255 is not supported if input and output are of data type S32");
    }
    else
    {
        ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_ZERO);

        int         exponent            = 0;
        const float normalized_mantissa = std::frexp(scale, &exponent);

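        // Example of the frexp() decomposition checked below: scale = 1/8 gives mantissa 0.5 and exponent -2, since 0.5 * 2^-2 == 1/8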
        // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
        // frexp returns 0.5 as mantissa, so the exponent is in the range -14 <= e <= 1
        // (1/2^n == 0.5 * 2^(1 - n)); it is zero or negative for all n >= 1
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), "Scale value not supported (Should be 1/(2^n) or 1/255)");
    }

    return Status{};
}

/* Scales a given vector by 1/255.
 *
 * @note This does not work for all cases. e.g. for float of 0.49999999999999994 and large floats.
 *
 * @param in Input vector to scale.
 * @return   Scaled output rounded to nearest (round half up).
 */
inline int32x4_t scale255_S32_S32(int32x4_t in)
{
    // Scale
    const float32x4_t tmp = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q);
    // Round to nearest (round half up)
    // Add +0.5 for all values
    // Afterwards vcvt rounds toward zero
    return vcvtq_s32_f32(vaddq_f32(tmp, positive_round_f32q));
}

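// Widens each half of the uint16x8 vector to 32 bits, scales it by 1/255 with round-half-up,
// then narrows back. Here the inputs are products of two U8 values (<= 255 * 255), so the
// s32 reinterpretation after widening is lossless.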
inline uint16x8_t scale255_U16_U16(uint16x8_t in)
{
    const int32x4_t tmp_s1 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(in))));
    const int32x4_t tmp_s2 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(in))));
    return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(tmp_s2), vmovn_s32(tmp_s1)));
}

template <typename T>
inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x16_t>::type
vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
{
    return vquantize_signed(val, info);
}

template <typename T>
inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x16_t>::type
vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
{
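    // T cannot be deduced from the arguments, so this template is not a candidate for the call
    // below; it resolves to the non-template vquantize() helper from NEAsymm.h rather than recursing.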
    return vquantize(val, info);
}

template <typename T>
void mul_saturate_quantized_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    const int  window_step_x         = 16 / sizeof(T);
    const auto window_start_x        = static_cast<int>(window.x().start());
    const auto window_end_x          = static_cast<int>(window.x().end());
    const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);

    const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();
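    // Fold the multiplication scale into the requantization parameters: quantizing with scale
    // (output_scale / scale) is equivalent to multiplying the dequantized result by 'scale' first.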
    const UniformQuantizationInfo tmp_qua_info    = { output_qua_info.scale / scale, output_qua_info.offset };

    if(is_broadcast_across_x)
    {
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
        const UniformQuantizationInfo broadcast_qinfo     = broadcast_tensor->info()->quantization_info().uniform();
        const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator output(out, win);

        using ExactTagType = typename wrapper::traits::neon_vector<T, window_step_x>::tag_type;

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<T *>(output.ptr());

            const auto broadcast_value     = *reinterpret_cast<const T *>(broadcast_input.ptr());
            const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);

                // Dequantize inputs
                const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo);
                const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo);

                const float32x4x4_t out_f32x4x4 =
                {
                    vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                    vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                    vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                    vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
                };

                // Quantize output
                const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
                wrapper::vstore(output_ptr + x, result);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                // Dequantize inputs
                const T     in1     = *(non_broadcast_input_ptr + x);
                const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(in1, non_broadcast_qinfo);
                const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(broadcast_value, broadcast_qinfo);
                const float tmp_f   = tmp_in1 * tmp_in2;

                // Quantize output
                const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
                *(output_ptr + x)  = tmp_qua;
            }
        },
        broadcast_input, non_broadcast_input, output);
    }
    else
    {
        const UniformQuantizationInfo input1_qua_info = in1->info()->quantization_info().uniform();
        const UniformQuantizationInfo input2_qua_info = in2->info()->quantization_info().uniform();

        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(in1, input1_win);
        Iterator input2(in2, input2_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<T *>(output.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto input1_q = wrapper::vloadq(input1_ptr + x);
                const auto input2_q = wrapper::vloadq(input2_ptr + x);

                // Dequantize inputs
                const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
                const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);

                const float32x4x4_t out_f32x4x4 =
                {
                    vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                    vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                    vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                    vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
                };

                // Quantize output
                const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
                wrapper::vstore(output_ptr + x, result);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                // Dequantize inputs
                const T     in1     = *(input1_ptr + x);
                const T     in2     = *(input2_ptr + x);
                const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(in1, input1_qua_info);
                const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(in2, input2_qua_info);
                const float tmp_f   = tmp_in1 * tmp_in2;

                // Quantize output
                const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
                *(output_ptr + x)  = tmp_qua;
            }
        },
        input1, input2, output);
    }
}

void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale)
{
    const UniformQuantizationInfo input1_qua_info = in1->info()->quantization_info().uniform();
    const UniformQuantizationInfo input2_qua_info = in2->info()->quantization_info().uniform();
    const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();

    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

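    // As in mul_saturate_quantized_8: dividing the output scale by 'scale' folds the
    // multiplication scale into the final requantization step.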
    const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<qsymm16_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const qsymm16x8x2_t input1_q =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const qsymm16x8x2_t input2_q =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };

            // Dequantize inputs
            const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
            const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);

            const float32x4x4_t out_f32x4x4 =
            {
                vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
            };

            const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info);
            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            // Dequantize inputs
            float tmp_in1 = static_cast<float>(*(input1_ptr + x)) * input1_qua_info.scale;
            float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale;
            float tmp_f   = tmp_in1 * tmp_in2;

            // Quantize output; lrintf() rounds to nearest using the default FP rounding mode, as the vector quantization path does
            int32_t   tmp     = lrintf(tmp_f / tmp_qua_info.scale);
            qsymm16_t tmp_qua = static_cast<qsymm16_t>((tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp));
            *(output_ptr + x) = tmp_qua;
        }
    },
    input1, input2, output);
}

void mul_QSYMM16_QSYMM16_S32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int scale)
{
    ARM_COMPUTE_UNUSED(scale);
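    // 'scale' is expected to be 1 here: validate_arguments rejects any other scale for the
    // QSYMM16 x QSYMM16 -> S32 path.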

    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const qsymm16x8x2_t input1_q =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const qsymm16x8x2_t input2_q =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };

            const int32x4x4_t in1_s32 =
            {
                {
                    vmovl_s16(vget_low_s16(input1_q.val[0])),
                    vmovl_s16(vget_high_s16(input1_q.val[0])),
                    vmovl_s16(vget_low_s16(input1_q.val[1])),
                    vmovl_s16(vget_high_s16(input1_q.val[1])),
                }
            };
            const int32x4x4_t in2_s32 =
            {
                {
                    vmovl_s16(vget_low_s16(input2_q.val[0])),
                    vmovl_s16(vget_high_s16(input2_q.val[0])),
                    vmovl_s16(vget_low_s16(input2_q.val[1])),
                    vmovl_s16(vget_high_s16(input2_q.val[1])),
                }
            };

            const int32x4x4_t result =
            {
                {
                    vmulq_s32(in1_s32.val[0], in2_s32.val[0]),
                    vmulq_s32(in1_s32.val[1], in2_s32.val[1]),
                    vmulq_s32(in1_s32.val[2], in2_s32.val[2]),
                    vmulq_s32(in1_s32.val[3], in2_s32.val[3]),
                }
            };

            vst1q_s32(output_ptr + x, result.val[0]);
            vst1q_s32(output_ptr + x + 4, result.val[1]);
            vst1q_s32(output_ptr + x + 8, result.val[2]);
            vst1q_s32(output_ptr + x + 12, result.val[3]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp       = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
            *(output_ptr + x) = tmp;
        }
    },
    input1, input2, output);
}

template <bool is_scale255, bool is_sat>
void mul_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16 / sizeof(uint8_t);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x);
            const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x);

            uint16x8_t       tmp1_high = vmovl_u8(vget_high_u8(ta1));
            const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
            uint16x8_t       tmp1_low  = vmovl_u8(vget_low_u8(ta1));
            const uint16x8_t tmp2_low  = vmovl_u8(vget_low_u8(ta2));

            tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
            tmp1_low  = vmulq_u16(tmp1_low, tmp2_low);

            if(is_scale255)
            {
                tmp1_high = scale255_U16_U16(tmp1_high);
                tmp1_low  = scale255_U16_U16(tmp1_low);
            }
            else
            {
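                // vshlq_u16/vqshlq_u16 take a signed per-lane shift count; the negative
                // count (-n) therefore performs the right shift by n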
                const int16x8_t vn = vdupq_n_s16(-n);

                if(is_sat)
                {
                    tmp1_high = vqshlq_u16(tmp1_high, vn);
                    tmp1_low  = vqshlq_u16(tmp1_low, vn);
                }
                else
                {
                    tmp1_high = vshlq_u16(tmp1_high, vn);
                    tmp1_low  = vshlq_u16(tmp1_low, vn);
                }
            }
            if(is_sat)
            {
                vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
            }
            else
            {
                vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
            }
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            uint16_t tmp = static_cast<uint16_t>(*(input1_ptr + x)) * static_cast<uint16_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;
                tmp         = static_cast<uint16_t>(tmp_f + 0.5f);
            }
            else
            {
                tmp >>= n;
            }
            if(is_sat && tmp > 255)
            {
                tmp = 255;
            }
            *(output_ptr + x) = static_cast<uint8_t>(tmp);
        }
    },
    input1, input2, output);
}

template <bool is_scale255, bool is_sat>
inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &input1, const int16x8_t &input2, int n)
{
    int32x4_t       tmp1_high = vmovl_s16(vget_high_s16(input1));
    const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(input2));
    int32x4_t       tmp1_low  = vmovl_s16(vget_low_s16(input1));
    const int32x4_t tmp2_low  = vmovl_s16(vget_low_s16(input2));

    tmp1_high = vmulq_s32(tmp1_high, tmp2_high);
    tmp1_low  = vmulq_s32(tmp1_low, tmp2_low);

    if(is_scale255)
    {
        tmp1_high = scale255_S32_S32(tmp1_high);
        tmp1_low  = scale255_S32_S32(tmp1_low);
    }
    else
    {
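        // Arithmetic right shift alone rounds toward negative infinity; TO_ZERO rounding
        // requires adding (2^n - 1) to negative values first. 'convert' computes
        // (sign ? 2^n - 1 : 0) branch-free as (sign << n) - sign, with sign = top bit of the product.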
        // Right shift amount
        const int32x4_t vn = vdupq_n_s32(-n);
        // Left shift amount
        const int32x4_t vnl = vdupq_n_s32(n);
        // Calculate conversion bit
        const uint32x4_t tmp1_high_u  = vreinterpretq_u32_s32(tmp1_high);
        const uint32x4_t tmp1_low_u   = vreinterpretq_u32_s32(tmp1_low);
        const uint32x4_t sign_high    = vshrq_n_u32(tmp1_high_u, 31);
        const uint32x4_t sign_low     = vshrq_n_u32(tmp1_low_u, 31);
        const int32x4_t  sign_high_s  = vreinterpretq_s32_u32(sign_high);
        const int32x4_t  sign_low_s   = vreinterpretq_s32_u32(sign_low);
        const int32x4_t  convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s);
        const int32x4_t  convert_low  = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s);
        if(is_sat)
        {
            tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
            tmp1_low  = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
        }
        else
        {
            tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
            tmp1_low  = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
        }
    }

    if(is_sat)
    {
        return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high));
    }
    else
    {
        return vcombine_s16(vmovn_s32(tmp1_low), vmovn_s32(tmp1_high));
    }
}

template <bool is_scale255, bool is_sat>
inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &input1, const int16x8x2_t &input2, int n)
{
    const int16x8x2_t result =
    {
        {
            // First 8 elements
            mul_S16_S16_S16_n_loop<is_scale255, is_sat>(input1.val[0], input2.val[0], n),
            // Second 8 elements
            mul_S16_S16_S16_n_loop<is_scale255, is_sat>(input1.val[1], input2.val[1], n)
        }
    };

    return result;
}

template <bool is_scale255, bool is_sat>
void mul_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const int16x8x2_t ta1 =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const int16x8x2_t ta2 =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };
            const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);

            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;

                tmp = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
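                    // Round the negative value toward zero: e.g. tmp = -5, n = 1 gives
                    // (-5 + 1) >> 1 = -2, whereas a plain -5 >> 1 would give -3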
                    uint32_t mask = (1u << n) - 1;
                    tmp           = (tmp + static_cast<int32_t>(mask)) >> n;
                }
            }
            if(is_sat)
            {
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
            }
            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, output);
}

template <bool is_sat>
inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &input1, const int32x4_t &input2, int n)
{
    const int32x2_t input1_1 = vget_low_s32(input1);
    const int32x2_t input2_1 = vget_low_s32(input2);
    const int32x2_t input1_2 = vget_high_s32(input1);
    const int32x2_t input2_2 = vget_high_s32(input2);

    int64x2_t tmp_1 = vmull_s32(input1_1, input2_1);
    int64x2_t tmp_2 = vmull_s32(input1_2, input2_2);

    // Apply scaling, conversion and rounding (round to zero)
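    // Same TO_ZERO rounding trick as mul_S16_S16_S16_n_loop, applied on 64-bit lanes:
    // add (2^n - 1) to negative products before the arithmetic right shift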
    // Right shift amount
    const int64x2_t vn = vdupq_n_s64(-n);
    // Left shift amount
    const int64x2_t vnl = vdupq_n_s64(n);
    // Calculate conversion bit
    const uint64x2_t tmp_1_u   = vreinterpretq_u64_s64(tmp_1);
    const uint64x2_t sign_1    = vshrq_n_u64(tmp_1_u, 63);
    const int64x2_t  sign_1_s  = vreinterpretq_s64_u64(sign_1);
    const int64x2_t  convert_1 = vsubq_s64(vshlq_s64(sign_1_s, vnl), sign_1_s);

    const uint64x2_t tmp_2_u   = vreinterpretq_u64_s64(tmp_2);
    const uint64x2_t sign_2    = vshrq_n_u64(tmp_2_u, 63);
    const int64x2_t  sign_2_s  = vreinterpretq_s64_u64(sign_2);
    const int64x2_t  convert_2 = vsubq_s64(vshlq_s64(sign_2_s, vnl), sign_2_s);
    if(is_sat)
    {
        tmp_1 = vqshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
        tmp_2 = vqshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
        return vcombine_s32(vqmovn_s64(tmp_1), vqmovn_s64(tmp_2));
    }
    else
    {
        tmp_1 = vshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
        tmp_2 = vshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
        return vcombine_s32(vmovn_s64(tmp_1), vmovn_s64(tmp_2));
    }
}

template <bool is_sat>
inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &input1, const int32x4x2_t &input2, int n)
{
    const int32x4x2_t result =
    {
        {
            // First 4 elements
            mul_S32_S32_S32_n_loop<is_sat>(input1.val[0], input2.val[0], n),
            // Second 4 elements
            mul_S32_S32_S32_n_loop<is_sat>(input1.val[1], input2.val[1], n)
        }
    };

    return result;
}

template <bool is_sat>
void mul_S32_S32_S32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    const int  window_step_x         = 8;
    const auto window_start_x        = static_cast<int>(window.x().start());
    const auto window_end_x          = static_cast<int>(window.x().end());
    const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);

    if(is_broadcast_across_x)
    {
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const int32_t *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<int32_t *>(output.ptr());

            const int32_t broadcast_value     = *reinterpret_cast<const int32_t *>(broadcast_input.ptr());
            const auto    broadcast_value_vec = vdupq_n_s32(broadcast_value);

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const int32x4x2_t broadcast_v =
                {
                    {
                        broadcast_value_vec,
                        broadcast_value_vec,
                    }
                };
                const int32x4x2_t non_broadcast_v =
                {
                    {
                        vld1q_s32(non_broadcast_input_ptr + x),
                        vld1q_s32(non_broadcast_input_ptr + x + 4),
                    }
                };
                const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(broadcast_v, non_broadcast_v, n);

                vst1q_s32(output_ptr + x, result.val[0]);
                vst1q_s32(output_ptr + x + 4, result.val[1]);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                int64_t tmp = static_cast<int64_t>(broadcast_value) * static_cast<int64_t>(*(non_broadcast_input_ptr + x));

                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
                    uint64_t mask = (1u << n) - 1;
                    tmp           = (tmp + static_cast<int64_t>(mask)) >> n;
                }
                if(is_sat)
                {
                    tmp = utility::clamp<int64_t, int32_t>(tmp);
                }
                *(output_ptr + x) = static_cast<int32_t>(tmp);
            }
        },
        broadcast_input, non_broadcast_input, output);
    }
    else
    {
        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(in1, input1_win);
        Iterator input2(in2, input2_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const int32_t *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const int32_t *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const int32x4x2_t ta1 =
                {
                    {
                        vld1q_s32(input1_ptr + x),
                        vld1q_s32(input1_ptr + x + 4),
                    }
                };
                const int32x4x2_t ta2 =
                {
                    {
                        vld1q_s32(input2_ptr + x),
                        vld1q_s32(input2_ptr + x + 4),
                    }
                };
                const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(ta1, ta2, n);

                vst1q_s32(output_ptr + x, result.val[0]);
                vst1q_s32(output_ptr + x + 4, result.val[1]);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                int64_t tmp = static_cast<int64_t>(*(input1_ptr + x)) * static_cast<int64_t>(*(input2_ptr + x));

                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
                    uint64_t mask = (1u << n) - 1;
                    tmp           = (tmp + static_cast<int64_t>(mask)) >> n;
                }
                if(is_sat)
                {
                    tmp = utility::clamp<int64_t, int32_t>(tmp);
                }
                *(output_ptr + x) = static_cast<int32_t>(tmp);
            }
        },
        input1, input2, output);
    }
}

void mul_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    constexpr int window_step_x         = 16 / sizeof(float);
    const auto    window_start_x        = static_cast<int>(window.x().start());
    const auto    window_end_x          = static_cast<int>(window.x().end());
    const bool    is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);

    using ExactTagType = typename wrapper::traits::neon_vector<float, window_step_x>::tag_type;

    if(is_broadcast_across_x)
    {
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<float *>(output.ptr());

            const float broadcast_value     = *reinterpret_cast<const float *>(broadcast_input.ptr());
            const auto  broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
            const auto  scale_vec           = wrapper::vdup_n(scale, ExactTagType{});

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
                auto       res             = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
                *(output_ptr + x)          = broadcast_value * non_broadcast_v * scale;
            }
        },
        broadcast_input, non_broadcast_input, output);
    }
    else
    {
        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(in1, input1_win);
        Iterator input2(in2, input2_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<float *>(output.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto ta1       = wrapper::vloadq(input1_ptr + x);
                const auto ta2       = wrapper::vloadq(input2_ptr + x);
                const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
                const auto res       = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto ta1    = *(input1_ptr + x);
                const auto ta2    = *(input2_ptr + x);
                *(output_ptr + x) = ta1 * ta2 * scale;
            }
        },
        input1, input2, output);
    }
}

void c_mul_F32_F32_F32_n(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    constexpr int window_step_x         = 8 / sizeof(float);
    const auto    window_start_x        = static_cast<int>(window.x().start());
    const auto    window_end_x          = static_cast<int>(window.x().end());
    const bool    is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);

    if(is_broadcast_across_x)
    {
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<float *>(output.ptr());

            const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
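            // Note: the broadcast path is scalar-only, and only the first float of the broadcast
            // element is read, so the loop below effectively multiplies each complex input
            // (a0 + i*a1) by (c + i*c) with c = broadcast_value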

            int x = window_start_x;
            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto broadcast_value0 = *(non_broadcast_input_ptr + 2 * x);
                const auto broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1);
                auto       res1             = broadcast_value * (broadcast_value0 - broadcast_value1);
                auto       res2             = broadcast_value * (broadcast_value1 + broadcast_value0);
                *(output_ptr + 2 * x)       = res1;
                *(output_ptr + 2 * x + 1)   = res2;
            }
        },
        broadcast_input, non_broadcast_input, output);
    }
    else
    {
        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(in1, input1_win);
        Iterator input2(in2, input2_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<float *>(output.ptr());

            using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
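                // Per complex pair: (a0 + i*a1) * (b0 + i*b1) = (a0*b0 - a1*b1) + i*(a0*b1 + a1*b0).
                // tmp0 duplicates the real parts of 'a' and tmp1 its imaginary parts; reversing 'b'
                // within each 64-bit pair and negating the lanes that land on real outputs lets a
                // single multiply-accumulate produce both components at once.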
                const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x);
                float32x4_t       b = wrapper::vloadq(input2_ptr + 2 * x);

                const float32x4_t mask  = { -1.0f, 1.0f, -1.0f, 1.0f };
                const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
                const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
                const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
                const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});

                const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
                const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);

                float32x4_t res = wrapper::vmul(tmp0, b);

                b = wrapper::vrev64(b);
                b = wrapper::vmul(b, mask);

                res = wrapper::vmla(res, tmp1, b);
                wrapper::vstore(output_ptr + 2 * x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto a0   = *(input1_ptr + 2 * x);
                const auto a1   = *(input1_ptr + 2 * x + 1);
                const auto b0   = *(input2_ptr + 2 * x);
                const auto b1   = *(input2_ptr + 2 * x + 1);
                auto       res1 = a0 * b0 - a1 * b1;
                auto       res2 = a0 * b1 + a1 * b0;
                *(output_ptr + 2 * x)     = res1;
                *(output_ptr + 2 * x + 1) = res2;
            }
        },
        input1, input2, output);
    }
}

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
void mul_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const float16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const float16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const float16x8x2_t ta1 =
            {
                {
                    vld1q_f16(input1_ptr + x),
                    vld1q_f16(input1_ptr + x + 8),
                }
            };
            const float16x8x2_t ta2 =
            {
                {
                    vld1q_f16(input2_ptr + x),
                    vld1q_f16(input2_ptr + x + 8),
                }
            };
            const float16x8_t   scale_vec = vdupq_n_f16(scale);
            const float16x8x2_t result    =
            {
                {
                    vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec),
                    vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec),
                }
            };
            vst1q_f16(output_ptr + x, result.val[0]);
            vst1q_f16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            const auto ta1    = *(input1_ptr + x);
            const auto ta2    = *(input2_ptr + x);
            *(output_ptr + x) = ta1 * ta2 * scale;
        }
    },
    input1, input2, output);
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */

template <bool is_scale255, bool is_sat>
void mul_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16 / sizeof(uint8_t);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const uint8x16_t bv = wrapper::vloadq(input2_ptr + x);
            const uint8x16_t av = wrapper::vloadq(input1_ptr + x);

            uint16x8_t tmp_low  = vmovl_u8(vget_low_u8(av));
            uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
            tmp_low             = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
            tmp_high            = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));

            if(is_scale255)
            {
                tmp_low  = scale255_U16_U16(tmp_low);
                tmp_high = scale255_U16_U16(tmp_high);
            }
            else
            {
                const int16x8_t vn = vdupq_n_s16(-n);

                if(is_sat)
                {
                    tmp_low  = vqshlq_u16(tmp_low, vn);
                    tmp_high = vqshlq_u16(tmp_high, vn);
                }
                else
                {
                    tmp_low  = vshlq_u16(tmp_low, vn);
                    tmp_high = vshlq_u16(tmp_high, vn);
                }
            }

            if(is_sat)
            {
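                // Clamp the unsigned products to SHRT_MAX so the reinterpretation to signed
                // 16-bit below cannot produce negative values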
1265 static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);
1266
1267 tmp_low = vminq_u16(tmp_low, max);
1268 tmp_high = vminq_u16(tmp_high, max);
1269 }
1270
1271 vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low));
1272 vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001273 }
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001274
1275 // Compute left-over elements
1276 for(; x < window_end_x; ++x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001277 {
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001278 int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
1279
1280 if(is_scale255)
1281 {
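                // Mirror the vector path: scale by 1/255 and round half away from zero.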
1282 float tmp_f = static_cast<float>(tmp) * scale255_constant;
1283 tmp = static_cast<int32_t>(tmp_f + 0.5f);
1284 }
1285 else
1286 {
1287 tmp >>= n;
1288 }
1289
1290 if(is_sat)
1291 {
1292 tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp;
1293 }
1294
1295 *(output_ptr + x) = static_cast<int16_t>(tmp);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001296 }
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001297 },
1298 input1, input2, output);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001299}
1300
1301template <bool is_scale255, bool is_sat>
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001302void mul_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001303{
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001304 // Create input windows
1305 Window win = window;
1306 Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
1307 Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001308
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001309    // Clear the X dimension on the execution window, as we handle it manually
1310 win.set(Window::DimX, Window::Dimension(0, 1, 1));
1311 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1312 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001313
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001314 Iterator input1(in1, input1_win);
1315 Iterator input2(in2, input2_win);
1316 Iterator output(out, win);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001317
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001318 const int window_step_x = 16;
1319 const auto window_start_x = static_cast<int>(window.x().start());
1320 const auto window_end_x = static_cast<int>(window.x().end());
1321
1322 execute_window_loop(win, [&](const Coordinates &)
1323 {
1324 const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
1325 const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
1326 const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
1327
1328 // Compute window_step_x elements per iteration
1329 int x = window_start_x;
1330 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1331 {
1332 const int16x8x2_t ta1 =
1333 {
1334 {
1335 vld1q_s16(input1_ptr + x),
1336 vld1q_s16(input1_ptr + x + 8),
1337 }
1338 };
1339 const uint8x8x2_t ta2u =
1340 {
1341 {
1342 vld1_u8(input2_ptr + x),
1343 vld1_u8(input2_ptr + x + 8),
1344 }
1345 };
1346 const int16x8x2_t ta2 =
1347 {
1348 {
1349 vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])),
1350 vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))
1351 }
1352 };
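            // Widening u8 -> u16 and reinterpreting as s16 is lossless here:
            // u8 values (at most 255) always fall within the positive s16 range.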
1353
1354 const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
1355
1356 vst1q_s16(output_ptr + x, result.val[0]);
1357 vst1q_s16(output_ptr + x + 8, result.val[1]);
1358 }
1359
1360 // Compute left-over elements
1361 for(; x < window_end_x; ++x)
1362 {
1363 int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
1364
1365 if(is_scale255)
1366 {
1367 float tmp_f = static_cast<float>(tmp) * scale255_constant;
1368
1369 tmp = static_cast<int32_t>(tmp_f + 0.5f);
1370 }
1371 else
1372 {
1373 if(tmp >= 0)
1374 {
1375 tmp >>= n;
1376 }
1377 else
1378 {
1379 uint32_t mask = (1u << n) - 1;
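                    // Add (2^n - 1) before the arithmetic shift so that negative
                    // values round towards zero rather than towards -infinity.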
1380 tmp = (tmp + static_cast<int32_t>(mask)) >> n;
1381 }
1382 }
1383 if(is_sat)
1384 {
1385 tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
1386 }
1387 *(output_ptr + x) = static_cast<int16_t>(tmp);
1388 }
1389 },
1390 input1, input2, output);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001391}
1392
1393template <bool is_scale255, bool is_sat>
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001394void mul_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001395{
1396 // Simply swap the two input buffers
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001397 mul_S16_U8_S16<is_scale255, is_sat>(in2, in1, out, window, n);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001398}
1399} // namespace
1400
1401NEPixelWiseMultiplicationKernel::NEPixelWiseMultiplicationKernel()
Michalis Spyrou6eb73452020-07-02 17:39:25 +01001402 : _func_float(nullptr), _func_int(nullptr), _func_quantized(nullptr), _scale{ 0 }, _scale_exponent{ 0 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001403{
1404}
1405
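// Illustrative use of this kernel (hypothetical caller code, tensor-info names
// assumed): multiply two U8 tensors into a U8 output with the 1/255 scale.
//   NEPixelWiseMultiplicationKernel k;
//   k.configure(src0_info, src1_info, dst_info, 1.f / 255.f,
//               ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_UP);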
Michalis Spyrou6eb73452020-07-02 17:39:25 +01001406void NEPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001407{
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +00001408 ARM_COMPUTE_UNUSED(rounding_policy);
Georgios Pinitasf0dea702017-07-03 18:17:28 +01001409 ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
1410
Michalis Spyrou6eb73452020-07-02 17:39:25 +01001411 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001412
Michalis Spyrou6eb73452020-07-02 17:39:25 +01001413 const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001414 const TensorShape &out_shape = broadcast_pair.first;
1415 const ValidRegion &valid_region = broadcast_pair.second;
Michalis Spyrou861f0db2018-02-26 16:47:58 +00001416
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001417 // Auto initialize output if not initialized
Michalis Spyrou6eb73452020-07-02 17:39:25 +01001418 set_shape_if_empty(*output, out_shape);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001419
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001420 _scale = scale;
1421 _scale_exponent = 0;
1422 _func_quantized = nullptr;
1423 _func_int = nullptr;
1424 _func_float = nullptr;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001425
1426 bool is_scale_255 = false;
1427 // Check and validate scaling factor
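    // scale == 1/255 selects a dedicated rounding path; any other supported scale
    // is expected to have the form 1/2^n and is converted to a right shift below.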
1428 if(std::abs(scale - scale255_constant) < 0.00001f)
1429 {
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001430 is_scale_255 = true;
1431 }
1432 else
1433 {
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +00001434 int exponent = 0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001435
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +00001436 std::frexp(scale, &exponent);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001437
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +00001438        // Store the positive exponent. We know that scale has the form 1/2^n.
1439        // Subtract 1 to compensate for frexp returning a mantissa in [0.5, 1) rather than [1, 2).
1440 _scale_exponent = std::abs(exponent - 1);
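        // Illustrative example: for scale = 1/4, frexp() yields 0.5 * 2^(-1), so
        // exponent = -1 and _scale_exponent = |-1 - 1| = 2, i.e. a right shift by 2.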
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001441 }
1442
Michalis Spyrou6eb73452020-07-02 17:39:25 +01001443 const DataType dt_input1 = input1->data_type();
1444 const DataType dt_input2 = input2->data_type();
1445 const DataType dt_output = output->data_type();
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001446 const bool is_sat = (overflow_policy == ConvertPolicy::SATURATE);
1447
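    // Select the specialization from the (input1, input2, output) type triple;
    // validate_arguments has already rejected unsupported combinations.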
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001448 switch(dt_input1)
Manuel Bottini79fa9a22019-02-22 17:54:22 +00001449 {
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001450 case DataType::QASYMM8:
1451 if(dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8)
1452 {
1453 _func_quantized = &mul_saturate_quantized_8<uint8_t>;
1454 }
1455 break;
1456 case DataType::QASYMM8_SIGNED:
1457            if(dt_input2 == DataType::QASYMM8_SIGNED && dt_output == DataType::QASYMM8_SIGNED)
1458 {
1459 _func_quantized = &mul_saturate_quantized_8<int8_t>;
1461 }
1462 break;
1463 case DataType::QSYMM16:
1464 if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16)
1465 {
1466 _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16;
1467 }
1468 else if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32)
1469 {
1470 _func_int = &mul_QSYMM16_QSYMM16_S32;
1471 }
1472 break;
1473 case DataType::S16:
1474 if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
1475 {
1476 if(is_scale_255)
1477 {
1478 _func_int = is_sat ? &mul_S16_U8_S16<true, true> : &mul_S16_U8_S16<true, false>;
1479 }
1480 else
1481 {
1482 _func_int = is_sat ? &mul_S16_U8_S16<false, true> : &mul_S16_U8_S16<false, false>;
1483 }
1484 }
1485            else if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
1486 {
1487 if(is_scale_255)
1488 {
1489 _func_int = is_sat ? &mul_S16_S16_S16<true, true> : &mul_S16_S16_S16<true, false>;
1490 }
1491 else
1492 {
1493 _func_int = is_sat ? &mul_S16_S16_S16<false, true> : &mul_S16_S16_S16<false, false>;
1494 }
1495 }
1496 break;
SiCong Libb88f892020-08-28 11:18:47 +01001497 case DataType::S32:
1498 if(DataType::S32 == dt_input2 && DataType::S32 == dt_output)
1499 {
1500 _func_int = is_sat ? &mul_S32_S32_S32<true> : &mul_S32_S32_S32<false>;
1501 }
1502 break;
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001503 case DataType::U8:
1504 if(DataType::U8 == dt_input2 && DataType::U8 == dt_output)
1505 {
1506 if(is_scale_255)
1507 {
1508 _func_int = is_sat ? &mul_U8_U8_U8<true, true> : &mul_U8_U8_U8<true, false>;
1509 }
1510 else
1511 {
1512 _func_int = is_sat ? &mul_U8_U8_U8<false, true> : &mul_U8_U8_U8<false, false>;
1513 }
1514 }
1515 else if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
1516 {
1517 if(is_scale_255)
1518 {
1519 _func_int = is_sat ? &mul_U8_U8_S16<true, true> : &mul_U8_U8_S16<true, false>;
1520 }
1521 else
1522 {
1523 _func_int = is_sat ? &mul_U8_U8_S16<false, true> : &mul_U8_U8_S16<false, false>;
1524 }
1525 }
1526 else if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
1527 {
1528 if(is_scale_255)
1529 {
1530 _func_int = is_sat ? &mul_U8_S16_S16<true, true> : &mul_U8_S16_S16<true, false>;
1531 }
1532 else
1533 {
1534 _func_int = is_sat ? &mul_U8_S16_S16<false, true> : &mul_U8_S16_S16<false, false>;
1535 }
1536 }
1537 break;
1538#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1539 case DataType::F16:
1540 _func_float = &mul_F16_F16_F16;
1541 break;
1542#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1543 case DataType::F32:
1544 _func_float = &mul_F32_F32_F32;
1545 break;
1546 default:
1547            ARM_COMPUTE_ERROR("Unsupported combination of input and output data types");
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001548 }
1549
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001550 // Configure kernel window
1553 output->set_valid_region(valid_region);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001554 Window win = calculate_max_window(valid_region, Steps());
1555
1556 INEKernel::configure(win);
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +00001557}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001558
Georgios Pinitas631c41a2017-12-06 11:53:03 +00001559Status NEPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy,
1560 RoundingPolicy rounding_policy)
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +00001561{
Michalis Spyrou861f0db2018-02-26 16:47:58 +00001562 ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +00001563 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001564
Georgios Pinitas631c41a2017-12-06 11:53:03 +00001565 return Status{};
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001566}
1567
Georgios Pinitas0499dff2020-07-31 22:21:38 +01001568void NEPixelWiseMultiplicationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001569{
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001570 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001571 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
1572 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
1573
Georgios Pinitas0499dff2020-07-31 22:21:38 +01001574 auto input1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
1575 auto input2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
1576 auto output = tensors.get_tensor(TensorType::ACL_DST);
Michalis Spyrou6eb73452020-07-02 17:39:25 +01001577
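    // Dispatch to whichever family configure() selected: quantized and float
    // kernels receive the raw scale, integer kernels the precomputed shift exponent.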
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001578 if(_func_quantized != nullptr)
Michalis Spyrou861f0db2018-02-26 16:47:58 +00001579 {
Michalis Spyrou6eb73452020-07-02 17:39:25 +01001580 (*_func_quantized)(input1, input2, output, window, _scale);
Manuel Bottini79fa9a22019-02-22 17:54:22 +00001581 }
1582 else if(_func_int != nullptr)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001583 {
Michalis Spyrou6eb73452020-07-02 17:39:25 +01001584 (*_func_int)(input1, input2, output, window, _scale_exponent);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001585 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001586 else
1587 {
1588 ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
Michalis Spyrou6eb73452020-07-02 17:39:25 +01001589 (*_func_float)(input1, input2, output, window, _scale);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001590 }
1591}
giuros01154bc1c2019-03-26 17:44:40 +00001592namespace
1593{
giuros01154bc1c2019-03-26 17:44:40 +00001594Status validate_arguments_complex(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
1595{
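    // Complex tensors are modelled as 2-channel F32, i.e. interleaved real and
    // imaginary parts, which is what the channel count of 2 below checks.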
1596 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 2, DataType::F32);
1597 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 2, DataType::F32);
1598
1599 const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
1600
1601 ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
1602
1603 // Validate in case of configured output
1604 if(output->total_size() > 0)
1605 {
1606 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 2, DataType::F32);
1607 ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
1608 }
1609
1610 return Status{};
1611}
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001612} // namespace
giuros01154bc1c2019-03-26 17:44:40 +00001613
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001614void NEComplexPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
giuros01154bc1c2019-03-26 17:44:40 +00001615{
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001616 ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
1617 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(input1, input2, output));
1618
giuros01154bc1c2019-03-26 17:44:40 +00001619 const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
1620 const TensorShape &out_shape = broadcast_pair.first;
1621 const ValidRegion &valid_region = broadcast_pair.second;
1622
1623 // Auto initialize output if not initialized
1624 const TensorInfo out_info(out_shape, input1->num_channels(), input1->data_type());
1625 auto_init_if_empty(*output, out_info);
1626
giuros01154bc1c2019-03-26 17:44:40 +00001627 // Configure kernel window
1630 output->set_valid_region(valid_region);
1631 Window win = calculate_max_window(valid_region, Steps());
giuros01154bc1c2019-03-26 17:44:40 +00001632
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001633 INEKernel::configure(win);
giuros01154bc1c2019-03-26 17:44:40 +00001634}
1635
1636Status NEComplexPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
1637{
1638 ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
1639 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(input1, input2, output));
giuros01154bc1c2019-03-26 17:44:40 +00001640
1641 return Status{};
1642}
1643
Georgios Pinitas0499dff2020-07-31 22:21:38 +01001644void NEComplexPixelWiseMultiplicationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
giuros01154bc1c2019-03-26 17:44:40 +00001645{
1646 ARM_COMPUTE_UNUSED(info);
1647 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
1648 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
1649
Georgios Pinitas0499dff2020-07-31 22:21:38 +01001650 auto input1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
1651 auto input2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
1652 auto output = tensors.get_tensor(TensorType::ACL_DST);
Michalis Spyrou6eb73452020-07-02 17:39:25 +01001653
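    // c_mul_F32_F32_F32_n multiplies interleaved complex pairs per element:
    // (a + bi) * (c + di) = (ac - bd) + (ad + bc)i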
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001654 c_mul_F32_F32_F32_n(input1, input2, output, window);
giuros01154bc1c2019-03-26 17:44:40 +00001655}
Manuel Bottini79fa9a22019-02-22 17:54:22 +00001656} // namespace arm_compute