blob: 80014821548f000b677328c9fc129d2ed563c759 [file] [log] [blame]
/*
 * Copyright (c) 2016-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
Georgios Pinitas7891a732021-08-20 21:39:25 +010024#include "src/cpu/kernels/CpuMulKernel.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
Sheri Zhang1e3ab422021-03-16 17:35:08 +000026#include "arm_compute/core/ITensor.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010027#include "arm_compute/core/TensorInfo.h"
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010028
Pablo Marquez Tello568aab62023-11-20 14:20:01 +000029#include "src/core/common/Registrars.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010030#include "src/core/CPP/Validate.h"
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010031#include "src/core/helpers/AutoConfiguration.h"
32#include "src/core/helpers/WindowHelpers.h"
Georgios Pinitasddb93bb2020-10-02 16:38:59 +010033#include "src/core/NEON/NEAsymm.h"
34#include "src/core/NEON/NESymm.h"
35#include "src/core/NEON/wrapper/wrapper.h"
Pablo Marquez Tello568aab62023-11-20 14:20:01 +000036#include "src/cpu/kernels/mul/generic/neon/list.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010037
38#include <arm_neon.h>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010039
namespace
{
// NOTE(review): "mws" presumably abbreviates "minimum workload size" and N1/V1
// presumably name specific CPU models; confirm against the code that reads
// these constants (outside this excerpt).
#if defined(ENABLE_FP32_KERNELS)
static constexpr size_t default_mws_N1_fp32_neon = 22447;
static constexpr size_t default_mws_V1_fp32_neon = 38982;
#endif /* ENABLE_FP32_KERNELS */
// Fallback value; available regardless of ENABLE_FP32_KERNELS.
static constexpr size_t default_mws_other_platforms_1d_tensor = 10240;
} // namespace
Anthony Barbier6ff3b192017-09-04 18:44:23 +010048namespace arm_compute
49{
Sheri Zhang1e3ab422021-03-16 17:35:08 +000050namespace cpu
51{
52namespace kernels
53{
Anthony Barbier6ff3b192017-09-04 18:44:23 +010054namespace
55{
56const float scale255_constant = 1.f / 255.f;
57const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
58const float32x4_t positive_round_f32q = vdupq_n_f32(0.5f);
59
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010060inline Status validate_arguments(const ITensorInfo *src1,
61 const ITensorInfo *src2,
62 const ITensorInfo *dst,
63 float scale,
64 ConvertPolicy overflow_policy,
65 RoundingPolicy rounding_policy)
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +000066{
67 ARM_COMPUTE_UNUSED(overflow_policy);
68 ARM_COMPUTE_UNUSED(rounding_policy);
69
Sheri Zhang1e3ab422021-03-16 17:35:08 +000070 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src1);
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010071 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8,
72 DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32,
73 DataType::QSYMM16, DataType::F16, DataType::F32);
74 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8,
75 DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32,
76 DataType::QSYMM16, DataType::F16, DataType::F32);
77 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8,
78 DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16,
Michele Di Giorgio9428a182020-03-30 14:10:20 +010079 DataType::S32, DataType::F16, DataType::F32);
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010080 if (is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type()))
Pablo Tello52ea9c22019-12-10 11:28:53 +000081 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +000082 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010083 ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP,
84 "ConvertPolicy cannot be WRAP if datatype is quantized");
Pablo Tello52ea9c22019-12-10 11:28:53 +000085 }
Manuel Bottini79fa9a22019-02-22 17:54:22 +000086
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010087 if (dst->total_size() > 0)
Manuel Bottini79fa9a22019-02-22 17:54:22 +000088 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +000089 const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +010090 ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0),
91 "Wrong shape for dst");
Manuel Bottini79fa9a22019-02-22 17:54:22 +000092 ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
SiCong Libb88f892020-08-28 11:18:47 +010093 // clang-format off
94 ARM_COMPUTE_RETURN_ERROR_ON_MSG(
Sheri Zhang1e3ab422021-03-16 17:35:08 +000095 !(src1->data_type() == src2->data_type() && src2->data_type() == dst->data_type()) &&
96 !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
97 !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::S16 && dst->data_type() == DataType::S16) &&
98 !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
99 !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
100 !(src1->data_type() == DataType::QSYMM16 && src2->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32)
SiCong Libb88f892020-08-28 11:18:47 +0100101 , "Invalid data type combination");
102 // clang-format on
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100103 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S16 && dst->data_type() == DataType::S32 &&
104 scale != 1.f,
105 "Unsupported scale for QSYMM16 inputs and S32 dst");
Manuel Bottini79fa9a22019-02-22 17:54:22 +0000106 }
Michalis Spyrou861f0db2018-02-26 16:47:58 +0000107
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100108 if (std::abs(scale - scale255_constant) < 0.00001f)
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +0000109 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100110 ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP &&
111 rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
112 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 &&
113 dst->data_type() == DataType::S32,
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000114 "Scale == 1/255 is not supported if input and dst are of data type S32");
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +0000115 }
116 else
117 {
118 ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_ZERO);
119
120 int exponent = 0;
121 const float normalized_mantissa = std::frexp(scale, &exponent);
122
123 // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
124 // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= 14
125 // Moreover, it will be negative as we deal with 1/2^n
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100126 ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)),
127 "Scale value not supported (Should be 1/(2^n) or 1/255");
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +0000128 }
129
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000130 return Status{};
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +0000131}
132
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100133/* Scales a given vector by 1/255.
134 *
135 * @note This does not work for all cases. e.g. for float of 0.49999999999999994 and large floats.
136 *
137 * @param in Input vector to scale.
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000138 * @return Scaled dst rounded to nearest (round half up).
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100139 */
140inline int32x4_t scale255_S32_S32(int32x4_t in)
141{
142 // Scale
143 const float32x4_t tmp = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q);
144 // Round to nearest (round half up)
145 // Add +0.5 for all values
146 // Afterwards vcvt rounds toward zero
147 return vcvtq_s32_f32(vaddq_f32(tmp, positive_round_f32q));
148}
149
150inline uint16x8_t scale255_U16_U16(uint16x8_t in)
151{
152 const int32x4_t tmp_s1 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(in))));
153 const int32x4_t tmp_s2 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(in))));
154 return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(tmp_s2), vmovn_s32(tmp_s1)));
155}
156
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100157template <typename T>
158inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x16_t>::type
159vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
Manuel Bottini79fa9a22019-02-22 17:54:22 +0000160{
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100161 return vquantize_signed(val, info);
Manuel Bottini79fa9a22019-02-22 17:54:22 +0000162}
163
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100164template <typename T>
165inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x16_t>::type
166vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
Pablo Tello52ea9c22019-12-10 11:28:53 +0000167{
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100168 return vquantize(val, info);
Pablo Tello52ea9c22019-12-10 11:28:53 +0000169}
170
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100171template <typename T>
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000172void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100173{
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100174 // Create input windows
175 Window win = window;
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000176 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
177 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100178
179 // Clear X Dimension on execution window as we handle manually
180 win.set(Window::DimX, Window::Dimension(0, 1, 1));
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100181
Sheri Zhanga449a362020-07-16 15:52:25 +0100182 const int window_step_x = 16 / sizeof(T);
183 const auto window_start_x = static_cast<int>(window.x().start());
184 const auto window_end_x = static_cast<int>(window.x().end());
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000185 const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100186
Sheri Zhanga449a362020-07-16 15:52:25 +0100187 const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100188 const UniformQuantizationInfo tmp_qua_info = {output_qua_info.scale / scale, output_qua_info.offset};
Manuel Bottini7bb56c62019-06-26 15:17:09 +0100189
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100190 if (is_broadcast_across_x)
Manuel Bottini7bb56c62019-06-26 15:17:09 +0100191 {
Sheri Zhanga449a362020-07-16 15:52:25 +0100192 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
193 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
194 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000195 const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
196 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
Sheri Zhanga449a362020-07-16 15:52:25 +0100197 const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100198 const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
Manuel Bottini7bb56c62019-06-26 15:17:09 +0100199
Sheri Zhanga449a362020-07-16 15:52:25 +0100200 // Clear X Dimension on execution window as we handle manually
201 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
202
203 Iterator broadcast_input(broadcast_tensor, broadcast_win);
204 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000205 Iterator dst(out, win);
Sheri Zhanga449a362020-07-16 15:52:25 +0100206
207 using ExactTagType = typename wrapper::traits::neon_vector<T, window_step_x>::tag_type;
208
Omar Al Khatibb230a1f2022-11-01 17:01:24 +0000209 execute_window_loop(
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100210 win,
211 [&](const Coordinates &)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100212 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100213 const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
214 const auto output_ptr = reinterpret_cast<T *>(dst.ptr());
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100215
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100216 const auto broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr());
217 const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100218
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100219 // Compute window_step_x elements per iteration
220 int x = window_start_x;
221 for (; x <= (window_end_x - window_step_x); x += window_step_x)
Sheri Zhanga449a362020-07-16 15:52:25 +0100222 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100223 const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
Sheri Zhanga449a362020-07-16 15:52:25 +0100224
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100225 // Dequantize inputs
226 const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo);
227 const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo);
Sheri Zhanga449a362020-07-16 15:52:25 +0100228
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100229 const float32x4x4_t out_f32x4x4 = {
230 vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
231 vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
232 vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
233 vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
234 };
Sheri Zhanga449a362020-07-16 15:52:25 +0100235
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100236 // Quantize dst
237 const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
238 wrapper::vstore(output_ptr + x, result);
239 }
240
241 // Compute left-over elements
242 for (; x < window_end_x; ++x)
243 {
244 // Dequantize inputs
245 const T src1 = *(non_broadcast_input_ptr + x);
246 const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, non_broadcast_qinfo);
247 const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(broadcast_value, broadcast_qinfo);
248 const float tmp_f = tmp_in1 * tmp_in2;
249
250 // Quantize dst
251 const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
252 *(output_ptr + x) = tmp_qua;
253 }
254 },
255 broadcast_input, non_broadcast_input, dst);
Sheri Zhanga449a362020-07-16 15:52:25 +0100256 }
257 else
258 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000259 const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
260 const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();
Sheri Zhanga449a362020-07-16 15:52:25 +0100261
262 // Clear X Dimension on execution window as we handle manually
263 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
264 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
265
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000266 Iterator input1(src1, input1_win);
267 Iterator input2(src2, input2_win);
268 Iterator dst(out, win);
Sheri Zhanga449a362020-07-16 15:52:25 +0100269
Omar Al Khatib605a9282022-11-01 17:01:24 +0000270 execute_window_loop(
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100271 win,
272 [&](const Coordinates &)
Sheri Zhanga449a362020-07-16 15:52:25 +0100273 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100274 const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
275 const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
276 const auto output_ptr = reinterpret_cast<T *>(dst.ptr());
Sheri Zhanga449a362020-07-16 15:52:25 +0100277
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100278 // Compute window_step_x elements per iteration
279 int x = window_start_x;
280 for (; x <= (window_end_x - window_step_x); x += window_step_x)
Sheri Zhanga449a362020-07-16 15:52:25 +0100281 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100282 const auto input1_q = wrapper::vloadq(input1_ptr + x);
283 const auto input2_q = wrapper::vloadq(input2_ptr + x);
Sheri Zhanga449a362020-07-16 15:52:25 +0100284
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100285 // Dequantize inputs
286 const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
287 const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
Sheri Zhanga449a362020-07-16 15:52:25 +0100288
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100289 const float32x4x4_t out_f32x4x4 = {
290 vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
291 vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
292 vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
293 vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
294 };
Sheri Zhanga449a362020-07-16 15:52:25 +0100295
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100296 // Quantize dst
297 const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
298 wrapper::vstore(output_ptr + x, result);
299 }
300
301 // Compute left-over elements
302 for (; x < window_end_x; ++x)
303 {
304 // Dequantize inputs
305 const T src1 = *(input1_ptr + x);
306 const T src2 = *(input2_ptr + x);
307 const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, input1_qua_info);
308 const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(src2, input2_qua_info);
309 const float tmp_f = tmp_in1 * tmp_in2;
310
311 // Quantize dst
312 const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
313 *(output_ptr + x) = tmp_qua;
314 }
315 },
316 input1, input2, dst);
Sheri Zhanga449a362020-07-16 15:52:25 +0100317 }
Michele Di Giorgio9428a182020-03-30 14:10:20 +0100318}
319
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100320bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0,
321 const ITensorInfo *src1,
322 const ITensorInfo *dst,
323 float scale)
Omar Al Khatib605a9282022-11-01 17:01:24 +0000324{
325 const auto iq0 = src0->quantization_info().uniform();
326 const auto iq1 = src1->quantization_info().uniform();
327 const auto oq = dst->quantization_info().uniform();
328
329 const auto multiplier = ((iq0.scale * iq1.scale) / oq.scale) * scale;
330
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100331 if (multiplier < -8191.f || multiplier > 8191.f)
Omar Al Khatib605a9282022-11-01 17:01:24 +0000332 {
333 //The multiplier cannot be stored as a 14.18 signed fixed-point number
334 return false;
335 }
336
337 const auto offset_out = float(oq.offset);
338
339 const auto max_result = multiplier * (256) * (256) + offset_out;
340
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100341 if (max_result > 8191.f)
Omar Al Khatib605a9282022-11-01 17:01:24 +0000342 {
343 //It might not be possible to store the result as a 14.18 signed fixed-point number.
344 return false;
345 }
346
347 return true;
348}
349
350template <typename ScalarType>
351void mul_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const Window &window, float scale)
352{
353 const auto in0_info = src0->info();
354 const auto in1_info = src1->info();
355
356 const auto &in0_shape = in0_info->tensor_shape();
357 const auto &in1_shape = in1_info->tensor_shape();
358
359 // Create input windows.
360 Window in0_win = window.broadcast_if_dimension_le_one(in0_shape);
361 Window in1_win = window.broadcast_if_dimension_le_one(in1_shape);
362
363 // Clear the x dimension on the execution window as we process the whole row each iteration.
364 Window win = window;
365 win.set(Window::DimX, Window::Dimension(0, 1, 1));
366
367 constexpr int window_step_x = 16;
368 const auto window_start_x = window.x().start();
369 const auto window_end_x = window.x().end();
370 const auto is_broadcast_across_x = in0_shape.x() != in1_shape.x();
371
372 const auto iq0_info = in0_info->quantization_info().uniform();
373 const auto iq1_info = in1_info->quantization_info().uniform();
374 const auto oq_info = dst->info()->quantization_info().uniform();
375
376 const auto in0_offset = iq0_info.offset;
377 const auto in1_offset = iq1_info.offset;
378 const auto out_offset = oq_info.offset;
379 const auto multiplier = ((iq0_info.scale * iq1_info.scale) / oq_info.scale) * scale;
380
381 constexpr int32_t two_pwr18i = 262144;
382 constexpr float two_pwr18f = 262144.f;
383
384 const auto in0_offset_16p0 = static_cast<int16_t>(in0_offset);
385 const auto in1_offset_16p0 = static_cast<int16_t>(in1_offset);
386 const auto out_offset_14p18 = static_cast<int32_t>(out_offset * two_pwr18i);
387 const auto multiplier_14p18 = static_cast<int32_t>(multiplier * two_pwr18f);
388
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100389 if (is_broadcast_across_x)
Omar Al Khatib605a9282022-11-01 17:01:24 +0000390 {
391 // Prefix: a = non-broadcast, b = broadcast.
392
393 const auto is_broadcast_input_1 = in1_win.x().step() == 0;
394 auto a_win = is_broadcast_input_1 ? in0_win : in1_win;
395 auto b_win = is_broadcast_input_1 ? in1_win : in0_win;
396 const auto a_tensor = is_broadcast_input_1 ? src0 : src1;
397 const auto b_tensor = is_broadcast_input_1 ? src1 : src0;
398
399 const auto a_offset_16p0 = is_broadcast_input_1 ? in0_offset_16p0 : in1_offset_16p0;
400 const auto b_offset_16p0 = is_broadcast_input_1 ? in1_offset : in0_offset;
401#ifndef __aarch64__
402 const auto a_offset = is_broadcast_input_1 ? in0_offset : in1_offset;
403 const auto b_offset = is_broadcast_input_1 ? in1_offset : in0_offset;
404#endif //__aarch64__
405 const auto a_voffset_16p0 = wrapper::vdup_n(a_offset_16p0, wrapper::traits::vector_64_tag());
406
407 // Clear the x dimension on the execution window as we process the whole row each iteration.
408 a_win.set(Window::DimX, Window::Dimension(0, 1, 1));
409
410 Iterator a_input_it(a_tensor, a_win);
411 Iterator b_input_it(b_tensor, b_win);
412 Iterator out_it(dst, win);
413
414 execute_window_loop(
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100415 win,
416 [&](const Coordinates &)
Omar Al Khatib605a9282022-11-01 17:01:24 +0000417 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100418 const auto a_ptr = reinterpret_cast<const ScalarType *>(a_input_it.ptr());
419 const auto b_ptr = reinterpret_cast<const ScalarType *>(b_input_it.ptr());
Omar Al Khatib605a9282022-11-01 17:01:24 +0000420 const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
421
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100422 const auto b_val = *b_ptr;
423 const auto b_offseted_32p0 = static_cast<int32_t>(b_val - b_offset_16p0);
424 const auto b_voffseted_32p0 = wrapper::vdup_n(b_offseted_32p0, wrapper::traits::vector_128_tag());
Omar Al Khatib605a9282022-11-01 17:01:24 +0000425
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100426 const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag());
427 const auto voffsetout_14p18 = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag());
428
429 int x = window_start_x;
430
431 for (; x <= (window_end_x - window_step_x); x += window_step_x)
432 {
433 // Load the inputs.
434 const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x);
435
436 // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness.
437 const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0)));
438 const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0)));
439
440 const auto voffseted_32p0_00 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_0), a_voffset_16p0);
441 const auto voffseted_32p0_01 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_0), a_voffset_16p0);
442 const auto voffseted_32p0_10 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_1), a_voffset_16p0);
443 const auto voffseted_32p0_11 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_1), a_voffset_16p0);
444
445 const auto vinnermul_32p0_00 = wrapper::vmul(voffseted_32p0_00, b_voffseted_32p0);
446 const auto vinnermul_32p0_01 = wrapper::vmul(voffseted_32p0_01, b_voffseted_32p0);
447 const auto vinnermul_32p0_10 = wrapper::vmul(voffseted_32p0_10, b_voffseted_32p0);
448 const auto vinnermul_32p0_11 = wrapper::vmul(voffseted_32p0_11, b_voffseted_32p0);
449
450 const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18);
451 const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18);
452 const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18);
453 const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18);
454
455 // These shift rights are to revert the multiplication by twopwr18. Hard limit of a maximum shift by 8 requires multiple shift instructions to achieve this.
456 const auto vout_15p1_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00));
457 const auto vout_15p1_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01));
458 const auto vout_15p1_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10));
459 const auto vout_15p1_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11));
460
461 const auto vout_15p1_0 = wrapper::vcombine(vout_15p1_00, vout_15p1_01);
462
463 const auto vout_15p1_1 = wrapper::vcombine(vout_15p1_10, vout_15p1_11);
464 const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
465
466 const auto vout_8p0 =
467 wrapper::vcombine(wrapper::vqrshrn<2>(vout_15p1_0), wrapper::vqrshrn<2>(vout_15p1_1));
468 wrapper::vstore(out_ptr + x, vout_8p0);
469 }
470
471 //Process the left-over elements.
472 for (; x < window_end_x; ++x)
473 {
Omar Al Khatib605a9282022-11-01 17:01:24 +0000474#ifdef __aarch64__
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100475 out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(
476 (multiplier_14p18 * (int32_t(a_ptr[x]) - a_offset_16p0) * (int32_t(b_val) - b_offset_16p0)) +
477 out_offset_14p18)));
Omar Al Khatib605a9282022-11-01 17:01:24 +0000478#else //__aarch64__
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100479 out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround(
480 multiplier * ((float(a_ptr[x]) - a_offset) * (float(b_val) - b_offset)) + float(out_offset)));
Omar Al Khatib605a9282022-11-01 17:01:24 +0000481#endif //__aarch64__
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100482 }
483 },
484 a_input_it, b_input_it, out_it);
Omar Al Khatib605a9282022-11-01 17:01:24 +0000485 }
486 else
487 {
488 const auto voffset0_16p0 = wrapper::vdup_n(in0_offset_16p0, wrapper::traits::vector_64_tag());
489 const auto voffset1_16p0 = wrapper::vdup_n(in1_offset_16p0, wrapper::traits::vector_64_tag());
490 const auto voffsetout_14p18 = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag());
491 const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag());
492
493 // Clear the x dimension on the execution window as we process the whole row each iteration.
494 in0_win.set(Window::DimX, Window::Dimension(0, 1, 1));
495 in1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
496
497 Iterator in0_it(src0, in0_win);
498 Iterator in1_it(src1, in1_win);
499 Iterator out_it(dst, win);
500
501 execute_window_loop(
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100502 win,
503 [&](const Coordinates &)
Omar Al Khatib605a9282022-11-01 17:01:24 +0000504 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100505 const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr());
506 const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr());
507 const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
Omar Al Khatib605a9282022-11-01 17:01:24 +0000508
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100509 int x = window_start_x;
Omar Al Khatib605a9282022-11-01 17:01:24 +0000510
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100511 for (; x <= (window_end_x - window_step_x); x += window_step_x)
512 {
513 // Load the inputs.
514 const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x);
515 const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x);
Omar Al Khatib605a9282022-11-01 17:01:24 +0000516
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100517 // Widen the input elements to signed 16-bit regardless of the input signedness.
518 const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0)));
519 const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0)));
520 const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0)));
521 const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0)));
Omar Al Khatib605a9282022-11-01 17:01:24 +0000522
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100523 const auto voffseted0_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_0), voffset0_16p0);
524 const auto voffseted0_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_0), voffset0_16p0);
525 const auto voffseted0_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_1), voffset0_16p0);
526 const auto voffseted0_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_1), voffset0_16p0);
Omar Al Khatib605a9282022-11-01 17:01:24 +0000527
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100528 const auto voffseted1_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_0), voffset1_16p0);
529 const auto voffseted1_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_0), voffset1_16p0);
530 const auto voffseted1_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_1), voffset1_16p0);
531 const auto voffseted1_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_1), voffset1_16p0);
Omar Al Khatib605a9282022-11-01 17:01:24 +0000532
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100533 const auto vinnermul_32p0_00 = wrapper::vmul(voffseted0_32p0_00, voffseted1_32p0_00);
534 const auto vinnermul_32p0_01 = wrapper::vmul(voffseted0_32p0_01, voffseted1_32p0_01);
535 const auto vinnermul_32p0_10 = wrapper::vmul(voffseted0_32p0_10, voffseted1_32p0_10);
536 const auto vinnermul_32p0_11 = wrapper::vmul(voffseted0_32p0_11, voffseted1_32p0_11);
Omar Al Khatib605a9282022-11-01 17:01:24 +0000537
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100538 const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18);
539 const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18);
540 const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18);
541 const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18);
Omar Al Khatib605a9282022-11-01 17:01:24 +0000542
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100543 // These shift rights are to revert the multiplication by twopwr18. Hard limit of a maximum shift by 8 requires multiple shift instructions to achieve this.
544 const auto vout_14p2_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00));
545 const auto vout_14p2_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01));
546 const auto vout_14p2_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10));
547 const auto vout_14p2_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11));
Omar Al Khatib605a9282022-11-01 17:01:24 +0000548
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100549 const auto vout_14p2_0 = wrapper::vcombine(vout_14p2_00, vout_14p2_01);
Omar Al Khatib605a9282022-11-01 17:01:24 +0000550
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100551 const auto vout_14p2_1 = wrapper::vcombine(vout_14p2_10, vout_14p2_11);
552
553 const auto vout_8p0 =
554 wrapper::vcombine(wrapper::vqrshrn<2>(vout_14p2_0), wrapper::vqrshrn<2>(vout_14p2_1));
555 wrapper::vstore(out_ptr + x, vout_8p0);
556 }
557
558 //Process the left-over elements.
559 for (; x < window_end_x; ++x)
560 {
Omar Al Khatib605a9282022-11-01 17:01:24 +0000561#ifdef __aarch64__
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100562 out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(
563 wrapper::vshrq_n<8>((multiplier_14p18 * (int32_t(in0_ptr[x]) - in0_offset_16p0) *
564 (int32_t(in1_ptr[x]) - in1_offset_16p0)) +
565 out_offset_14p18)));
Omar Al Khatib605a9282022-11-01 17:01:24 +0000566#else //__aarch64__
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100567 out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround(
568 multiplier * ((float(in0_ptr[x]) - in0_offset) * (float(in1_ptr[x]) - in1_offset)) +
569 float(out_offset)));
Omar Al Khatib605a9282022-11-01 17:01:24 +0000570#endif //__aarch64__
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100571 }
572 },
573 in0_it, in1_it, out_it);
Omar Al Khatib605a9282022-11-01 17:01:24 +0000574 }
575}
576
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100577void mul_saturate_QSYMM16_QSYMM16_QSYMM16(
578 const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100579{
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000580 const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
581 const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100582 const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();
583
584 // Create input windows
585 Window win = window;
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000586 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
587 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100588
589 // Clear X Dimension on execution window as we handle manually
590 win.set(Window::DimX, Window::Dimension(0, 1, 1));
591 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
592 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
593
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000594 Iterator input1(src1, input1_win);
595 Iterator input2(src2, input2_win);
596 Iterator dst(out, win);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100597
598 const int window_step_x = 16;
599 const auto window_start_x = static_cast<int>(window.x().start());
600 const auto window_end_x = static_cast<int>(window.x().end());
601
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100602 const UniformQuantizationInfo tmp_qua_info = {output_qua_info.scale / scale, output_qua_info.offset};
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100603
Omar Al Khatib605a9282022-11-01 17:01:24 +0000604 execute_window_loop(
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100605 win,
606 [&](const Coordinates &)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100607 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100608 const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
609 const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
610 const auto output_ptr = reinterpret_cast<qsymm16_t *>(dst.ptr());
611
612 // Compute window_step_x elements per iteration
613 int x = window_start_x;
614 for (; x <= (window_end_x - window_step_x); x += window_step_x)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100615 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100616 const qsymm16x8x2_t input1_q = {{
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100617 vld1q_s16(input1_ptr + x),
618 vld1q_s16(input1_ptr + x + 8),
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100619 }};
620 const qsymm16x8x2_t input2_q = {{
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100621 vld1q_s16(input2_ptr + x),
622 vld1q_s16(input2_ptr + x + 8),
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100623 }};
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100624
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100625 // Dequantize inputs
626 const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
627 const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100628
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100629 const float32x4x4_t out_f32x4x4 = {
630 vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
631 vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
632 vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
633 vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
634 };
635
636 const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info);
637 vst1q_s16(output_ptr + x, result.val[0]);
638 vst1q_s16(output_ptr + x + 8, result.val[1]);
639 }
640
641 // Compute left-over elements
642 for (; x < window_end_x; ++x)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100643 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100644 // Dequantize inputs
645 float tmp_in1 = static_cast<float>(*(input1_ptr + x)) * input1_qua_info.scale;
646 float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale;
647 float tmp_f = tmp_in1 * tmp_in2;
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100648
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100649 // Quantize dst, lrintf() has same rounding mode as vcombine_s16
650 int32_t tmp = lrintf(tmp_f / tmp_qua_info.scale);
651 qsymm16_t tmp_qua =
652 static_cast<qsymm16_t>(tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
653 *(output_ptr + x) = tmp_qua;
654 }
655 },
656 input1, input2, dst);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100657}
658
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000659void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int scale)
Michele Di Giorgio9428a182020-03-30 14:10:20 +0100660{
661 ARM_COMPUTE_UNUSED(scale);
Michele Di Giorgio9428a182020-03-30 14:10:20 +0100662
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100663 // Create input windows
664 Window win = window;
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000665 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
666 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
Michele Di Giorgio9428a182020-03-30 14:10:20 +0100667
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100668 // Clear X Dimension on execution window as we handle manually
669 win.set(Window::DimX, Window::Dimension(0, 1, 1));
670 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
671 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
Michele Di Giorgio9428a182020-03-30 14:10:20 +0100672
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000673 Iterator input1(src1, input1_win);
674 Iterator input2(src2, input2_win);
675 Iterator dst(out, win);
Michele Di Giorgio9428a182020-03-30 14:10:20 +0100676
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100677 const int window_step_x = 16;
678 const auto window_start_x = static_cast<int>(window.x().start());
679 const auto window_end_x = static_cast<int>(window.x().end());
680
Omar Al Khatib605a9282022-11-01 17:01:24 +0000681 execute_window_loop(
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100682 win,
683 [&](const Coordinates &)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100684 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100685 const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
686 const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
687 const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
688
689 // Compute window_step_x elements per iteration
690 int x = window_start_x;
691 for (; x <= (window_end_x - window_step_x); x += window_step_x)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100692 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100693 const qsymm16x8x2_t input1_q = {{
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100694 vld1q_s16(input1_ptr + x),
695 vld1q_s16(input1_ptr + x + 8),
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100696 }};
697 const qsymm16x8x2_t input2_q = {{
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100698 vld1q_s16(input2_ptr + x),
699 vld1q_s16(input2_ptr + x + 8),
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100700 }};
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100701
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100702 const int32x4x4_t in1_s32 = {{
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100703 vmovl_s16(vget_low_s16(input1_q.val[0])),
704 vmovl_s16(vget_high_s16(input1_q.val[0])),
705 vmovl_s16(vget_low_s16(input1_q.val[1])),
706 vmovl_s16(vget_high_s16(input1_q.val[1])),
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100707 }};
708 const int32x4x4_t in2_s32 = {{
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100709 vmovl_s16(vget_low_s16(input2_q.val[0])),
710 vmovl_s16(vget_high_s16(input2_q.val[0])),
711 vmovl_s16(vget_low_s16(input2_q.val[1])),
712 vmovl_s16(vget_high_s16(input2_q.val[1])),
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100713 }};
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100714
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100715 const int32x4x4_t result = {{
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100716 vmulq_s32(in1_s32.val[0], in2_s32.val[0]),
717 vmulq_s32(in1_s32.val[1], in2_s32.val[1]),
718 vmulq_s32(in1_s32.val[2], in2_s32.val[2]),
719 vmulq_s32(in1_s32.val[3], in2_s32.val[3]),
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100720 }};
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100721
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100722 vst1q_s32(output_ptr + x, result.val[0]);
723 vst1q_s32(output_ptr + x + 4, result.val[1]);
724 vst1q_s32(output_ptr + x + 8, result.val[2]);
725 vst1q_s32(output_ptr + x + 12, result.val[3]);
726 }
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100727
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100728 // Compute left-over elements
729 for (; x < window_end_x; ++x)
730 {
731 int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
732 *(output_ptr + x) = tmp;
733 }
734 },
735 input1, input2, dst);
Manuel Bottini7bb56c62019-06-26 15:17:09 +0100736}
737
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100738template <bool is_scale255, bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000739void mul_U8_U8_U8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100740{
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100741 // Create input windows
742 Window win = window;
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000743 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
744 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100745
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100746 // Clear X Dimension on execution window as we handle manually
747 win.set(Window::DimX, Window::Dimension(0, 1, 1));
748 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
749 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100750
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000751 Iterator input1(src1, input1_win);
752 Iterator input2(src2, input2_win);
753 Iterator dst(out, win);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100754
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100755 const int window_step_x = 16 / sizeof(uint8_t);
756 const auto window_start_x = static_cast<int>(window.x().start());
757 const auto window_end_x = static_cast<int>(window.x().end());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100758
Omar Al Khatib605a9282022-11-01 17:01:24 +0000759 execute_window_loop(
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100760 win,
761 [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100762 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100763 const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
764 const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
765 const auto output_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100766
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100767 // Compute window_step_x elements per iteration
768 int x = window_start_x;
769 for (; x <= (window_end_x - window_step_x); x += window_step_x)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100770 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100771 const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x);
772 const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100773
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100774 uint16x8_t tmp1_high = vmovl_u8(vget_high_u8(ta1));
775 const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
776 uint16x8_t tmp1_low = vmovl_u8(vget_low_u8(ta1));
777 const uint16x8_t tmp2_low = vmovl_u8(vget_low_u8(ta2));
778
779 tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
780 tmp1_low = vmulq_u16(tmp1_low, tmp2_low);
781
782 if (is_scale255)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100783 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100784 tmp1_high = scale255_U16_U16(tmp1_high);
785 tmp1_low = scale255_U16_U16(tmp1_low);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100786 }
787 else
788 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100789 const int16x8_t vn = vdupq_n_s16(-n);
790
791 if (is_sat)
792 {
793 tmp1_high = vqshlq_u16(tmp1_high, vn);
794 tmp1_low = vqshlq_u16(tmp1_low, vn);
795 }
796 else
797 {
798 tmp1_high = vshlq_u16(tmp1_high, vn);
799 tmp1_low = vshlq_u16(tmp1_low, vn);
800 }
801 }
802 if (is_sat)
803 {
804 vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
805 }
806 else
807 {
808 vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100809 }
810 }
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100811
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100812 // Compute left-over elements
813 for (; x < window_end_x; ++x)
814 {
815 uint16_t tmp = static_cast<uint16_t>(*(input1_ptr + x)) * static_cast<uint16_t>(*(input2_ptr + x));
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100816
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100817 if (is_scale255)
818 {
819 float tmp_f = static_cast<float>(tmp) * scale255_constant;
820 tmp = static_cast<uint16_t>(tmp_f + 0.5f);
821 }
822 else
823 {
824 tmp >>= n;
825 }
826 if (is_sat && tmp > 255)
827 {
828 tmp = 255;
829 }
830 *(output_ptr + x) = static_cast<uint8_t>(tmp);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100831 }
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100832 },
833 input1, input2, dst);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100834}
835
836template <bool is_scale255, bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000837inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t &src2, int n)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100838{
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000839 int32x4_t tmp1_high = vmovl_s16(vget_high_s16(src1));
840 const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(src2));
841 int32x4_t tmp1_low = vmovl_s16(vget_low_s16(src1));
842 const int32x4_t tmp2_low = vmovl_s16(vget_low_s16(src2));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100843
844 tmp1_high = vmulq_s32(tmp1_high, tmp2_high);
845 tmp1_low = vmulq_s32(tmp1_low, tmp2_low);
846
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100847 if (is_scale255)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100848 {
849 tmp1_high = scale255_S32_S32(tmp1_high);
850 tmp1_low = scale255_S32_S32(tmp1_low);
851 }
852 else
853 {
854 // Right shift amount
855 const int32x4_t vn = vdupq_n_s32(-n);
856 // Left shift amount
857 const int32x4_t vnl = vdupq_n_s32(n);
858 // Calculate conversion bit
859 const uint32x4_t tmp1_high_u = vreinterpretq_u32_s32(tmp1_high);
860 const uint32x4_t tmp1_low_u = vreinterpretq_u32_s32(tmp1_low);
861 const uint32x4_t sign_high = vshrq_n_u32(tmp1_high_u, 31);
862 const uint32x4_t sign_low = vshrq_n_u32(tmp1_low_u, 31);
863 const int32x4_t sign_high_s = vreinterpretq_s32_u32(sign_high);
864 const int32x4_t sign_low_s = vreinterpretq_s32_u32(sign_low);
865 const int32x4_t convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s);
866 const int32x4_t convert_low = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s);
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100867 if (is_sat)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100868 {
869 tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
870 tmp1_low = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
871 }
872 else
873 {
874 tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
875 tmp1_low = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
876 }
877 }
878
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100879 if (is_sat)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100880 {
881 return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high));
882 }
883 else
884 {
885 return vcombine_s16(vmovn_s32(tmp1_low), vmovn_s32(tmp1_high));
886 }
887}
888
889template <bool is_scale255, bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000890inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &src1, const int16x8x2_t &src2, int n)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100891{
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100892 const int16x8x2_t result = {{// First 8 elements
893 mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[0], src2.val[0], n),
894 // Second 8 elements
895 mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[1], src2.val[1], n)}};
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100896
897 return result;
898}
899
900template <bool is_scale255, bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000901void mul_S16_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100902{
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100903 // Create input windows
904 Window win = window;
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000905 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
906 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100907
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100908 // Clear X Dimension on execution window as we handle manually
909 win.set(Window::DimX, Window::Dimension(0, 1, 1));
910 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
911 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100912
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000913 Iterator input1(src1, input1_win);
914 Iterator input2(src2, input2_win);
915 Iterator dst(out, win);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100916
917 const int window_step_x = 16;
918 const auto window_start_x = static_cast<int>(window.x().start());
919 const auto window_end_x = static_cast<int>(window.x().end());
920
Omar Al Khatib605a9282022-11-01 17:01:24 +0000921 execute_window_loop(
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100922 win,
923 [&](const Coordinates &)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100924 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100925 const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
926 const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
927 const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
928
929 // Compute window_step_x elements per iteration
930 int x = window_start_x;
931 for (; x <= (window_end_x - window_step_x); x += window_step_x)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100932 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100933 const int16x8x2_t ta1 = {{
934 vld1q_s16(input1_ptr + x),
935 vld1q_s16(input1_ptr + x + 8),
936 }};
937 const int16x8x2_t ta2 = {{
938 vld1q_s16(input2_ptr + x),
939 vld1q_s16(input2_ptr + x + 8),
940 }};
941 const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100942
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100943 vst1q_s16(output_ptr + x, result.val[0]);
944 vst1q_s16(output_ptr + x + 8, result.val[1]);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100945 }
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100946
947 // Compute left-over elements
948 for (; x < window_end_x; ++x)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100949 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100950 int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
951
952 if (is_scale255)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100953 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100954 float tmp_f = static_cast<float>(tmp) * scale255_constant;
955
956 tmp = static_cast<int32_t>(tmp_f + 0.5f);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100957 }
958 else
959 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100960 if (tmp >= 0)
961 {
962 tmp >>= n;
963 }
964 else
965 {
966 uint32_t mask = (1u << n) - 1;
967 tmp = (tmp + static_cast<int32_t>(mask)) >> n;
968 }
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100969 }
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100970 if (is_sat)
971 {
972 tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
973 }
974 *(output_ptr + x) = static_cast<int16_t>(tmp);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100975 }
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +0100976 },
977 input1, input2, dst);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100978}
979
Omar Al Khatib605a9282022-11-01 17:01:24 +0000980template <bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000981inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t &src2, int n)
SiCong Libb88f892020-08-28 11:18:47 +0100982{
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000983 const int32x2_t input1_1 = vget_low_s32(src1);
984 const int32x2_t input2_1 = vget_low_s32(src2);
985 const int32x2_t input1_2 = vget_high_s32(src1);
986 const int32x2_t input2_2 = vget_high_s32(src2);
SiCong Libb88f892020-08-28 11:18:47 +0100987
988 int64x2_t tmp_1 = vmull_s32(input1_1, input2_1);
989 int64x2_t tmp_2 = vmull_s32(input1_2, input2_2);
990
991 // Apply scaling, conversion and rounding (round to zero)
992 // Right shift amount
993 const int64x2_t vn = vdupq_n_s64(-n);
994 // Left shift amount
995 const int64x2_t vnl = vdupq_n_s64(n);
996 // Calculate conversion bit
997 const uint64x2_t tmp_1_u = vreinterpretq_u64_s64(tmp_1);
998 const uint64x2_t sign_1 = vshrq_n_u64(tmp_1_u, 63);
999 const int64x2_t sign_1_s = vreinterpretq_s64_u64(sign_1);
1000 const int64x2_t convert_1 = vsubq_s64(vshlq_s64(sign_1_s, vnl), sign_1_s);
1001
1002 const uint64x2_t tmp_2_u = vreinterpretq_u64_s64(tmp_2);
1003 const uint64x2_t sign_2 = vshrq_n_u64(tmp_2_u, 63);
1004 const int64x2_t sign_2_s = vreinterpretq_s64_u64(sign_2);
1005 const int64x2_t convert_2 = vsubq_s64(vshlq_s64(sign_2_s, vnl), sign_2_s);
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001006 if (is_sat)
SiCong Libb88f892020-08-28 11:18:47 +01001007 {
1008 tmp_1 = vqshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
1009 tmp_2 = vqshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
1010 return vcombine_s32(vqmovn_s64(tmp_1), vqmovn_s64(tmp_2));
1011 }
1012 else
1013 {
1014 tmp_1 = vshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
1015 tmp_2 = vshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
1016 return vcombine_s32(vmovn_s64(tmp_1), vmovn_s64(tmp_2));
1017 }
1018}
1019
Omar Al Khatib605a9282022-11-01 17:01:24 +00001020template <bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001021inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &src1, const int32x4x2_t &src2, int n)
SiCong Libb88f892020-08-28 11:18:47 +01001022{
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001023 const int32x4x2_t result = {{// First 4 elements
1024 mul_S32_S32_S32_n_loop<is_sat>(src1.val[0], src2.val[0], n),
1025 // Second 4 elements
1026 mul_S32_S32_S32_n_loop<is_sat>(src1.val[1], src2.val[1], n)}};
SiCong Libb88f892020-08-28 11:18:47 +01001027
1028 return result;
1029}
1030
1031template <bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001032void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
SiCong Libb88f892020-08-28 11:18:47 +01001033{
1034 // Create input windows
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001035 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
1036 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
SiCong Libb88f892020-08-28 11:18:47 +01001037
1038 // Clear X Dimension on execution window as we handle manually
SiCong Lid6d1b362020-09-24 17:34:23 +01001039 Window win = window;
SiCong Libb88f892020-08-28 11:18:47 +01001040 win.set(Window::DimX, Window::Dimension(0, 1, 1));
SiCong Libb88f892020-08-28 11:18:47 +01001041
SiCong Lid6d1b362020-09-24 17:34:23 +01001042 const int window_step_x = 8;
1043 const auto window_start_x = static_cast<int>(window.x().start());
1044 const auto window_end_x = static_cast<int>(window.x().end());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001045 const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
SiCong Libb88f892020-08-28 11:18:47 +01001046
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001047 if (is_broadcast_across_x)
SiCong Libb88f892020-08-28 11:18:47 +01001048 {
SiCong Lid6d1b362020-09-24 17:34:23 +01001049 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
1050 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
1051 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001052 const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
1053 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
SiCong Libb88f892020-08-28 11:18:47 +01001054
SiCong Lid6d1b362020-09-24 17:34:23 +01001055 // Clear X Dimension on execution window as we handle manually
1056 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1057
1058 Iterator broadcast_input(broadcast_tensor, broadcast_win);
1059 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001060 Iterator dst(out, win);
SiCong Lid6d1b362020-09-24 17:34:23 +01001061
Omar Al Khatib605a9282022-11-01 17:01:24 +00001062 execute_window_loop(
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001063 win,
1064 [&](const Coordinates &)
SiCong Lid6d1b362020-09-24 17:34:23 +01001065 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001066 const auto non_broadcast_input_ptr = reinterpret_cast<const int32_t *>(non_broadcast_input.ptr());
1067 const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
1068
1069 const int32_t broadcast_value = *reinterpret_cast<const int32_t *>(broadcast_input.ptr());
1070 const auto broadcast_value_vec = vdupq_n_s32(broadcast_value);
1071
1072 // Compute window_step_x elements per iteration
1073 int x = window_start_x;
1074 for (; x <= (window_end_x - window_step_x); x += window_step_x)
SiCong Lid6d1b362020-09-24 17:34:23 +01001075 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001076 const int32x4x2_t broadcast_v = {{
1077 broadcast_value_vec,
1078 broadcast_value_vec,
1079 }};
1080 const int32x4x2_t non_broadcast_v = {{
SiCong Lid6d1b362020-09-24 17:34:23 +01001081 vld1q_s32(non_broadcast_input_ptr + x),
1082 vld1q_s32(non_broadcast_input_ptr + x + 4),
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001083 }};
1084 const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(broadcast_v, non_broadcast_v, n);
1085
1086 vst1q_s32(output_ptr + x, result.val[0]);
1087 vst1q_s32(output_ptr + x + 4, result.val[1]);
1088 }
1089
1090 // Compute left-over elements
1091 for (; x < window_end_x; ++x)
1092 {
1093 int64_t tmp =
1094 static_cast<int64_t>(broadcast_value) * static_cast<int64_t>(*(non_broadcast_input_ptr + x));
1095
1096 if (tmp >= 0)
1097 {
1098 tmp >>= n;
SiCong Lid6d1b362020-09-24 17:34:23 +01001099 }
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001100 else
1101 {
1102 uint64_t mask = ((uint64_t)1u << n) - 1;
1103 tmp = (tmp + static_cast<int64_t>(mask)) >> n;
1104 }
1105 if (is_sat)
1106 {
1107 tmp = utility::clamp<int64_t, int32_t>(tmp);
1108 }
1109 *(output_ptr + x) = static_cast<int32_t>(tmp);
SiCong Lid6d1b362020-09-24 17:34:23 +01001110 }
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001111 },
1112 broadcast_input, non_broadcast_input, dst);
SiCong Lid6d1b362020-09-24 17:34:23 +01001113 }
1114 else
1115 {
1116 // Clear X Dimension on execution window as we handle manually
1117 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1118 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1119
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001120 Iterator input1(src1, input1_win);
1121 Iterator input2(src2, input2_win);
1122 Iterator dst(out, win);
SiCong Lid6d1b362020-09-24 17:34:23 +01001123
Omar Al Khatib605a9282022-11-01 17:01:24 +00001124 execute_window_loop(
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001125 win,
1126 [&](const Coordinates &)
SiCong Libb88f892020-08-28 11:18:47 +01001127 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001128 const auto input1_ptr = reinterpret_cast<const int32_t *>(input1.ptr());
1129 const auto input2_ptr = reinterpret_cast<const int32_t *>(input2.ptr());
1130 const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
1131
1132 // Compute window_step_x elements per iteration
1133 int x = window_start_x;
1134 for (; x <= (window_end_x - window_step_x); x += window_step_x)
SiCong Lid6d1b362020-09-24 17:34:23 +01001135 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001136 const int32x4x2_t ta1 = {{
1137 vld1q_s32(input1_ptr + x),
1138 vld1q_s32(input1_ptr + x + 4),
1139 }};
1140 const int32x4x2_t ta2 = {{
1141 vld1q_s32(input2_ptr + x),
1142 vld1q_s32(input2_ptr + x + 4),
1143 }};
1144 const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(ta1, ta2, n);
1145
1146 vst1q_s32(output_ptr + x, result.val[0]);
1147 vst1q_s32(output_ptr + x + 4, result.val[1]);
1148 }
1149
1150 // Compute left-over elements
1151 for (; x < window_end_x; ++x)
1152 {
1153 int64_t tmp = static_cast<int64_t>(*(input1_ptr + x)) * static_cast<int64_t>(*(input2_ptr + x));
1154
1155 if (tmp >= 0)
SiCong Lid6d1b362020-09-24 17:34:23 +01001156 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001157 tmp >>= n;
SiCong Lid6d1b362020-09-24 17:34:23 +01001158 }
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001159 else
SiCong Lid6d1b362020-09-24 17:34:23 +01001160 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001161 uint64_t mask = ((uint64_t)1u << n) - 1;
1162 tmp = (tmp + static_cast<int64_t>(mask)) >> n;
SiCong Lid6d1b362020-09-24 17:34:23 +01001163 }
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001164 if (is_sat)
1165 {
1166 tmp = utility::clamp<int64_t, int32_t>(tmp);
1167 }
1168 *(output_ptr + x) = static_cast<int32_t>(tmp);
SiCong Lid6d1b362020-09-24 17:34:23 +01001169 }
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001170 },
1171 input1, input2, dst);
SiCong Lid6d1b362020-09-24 17:34:23 +01001172 }
SiCong Libb88f892020-08-28 11:18:47 +01001173}
1174
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001175void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window)
giuros01154bc1c2019-03-26 17:44:40 +00001176{
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001177 // Create input windows
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001178 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
1179 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
giuros01154bc1c2019-03-26 17:44:40 +00001180
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001181 // Clear X Dimension on execution window as we handle manually
1182 Window win = window;
1183 win.set(Window::DimX, Window::Dimension(0, 1, 1));
giuros01154bc1c2019-03-26 17:44:40 +00001184
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001185 constexpr int window_step_x = 8 / sizeof(float);
1186 const auto window_start_x = static_cast<int>(window.x().start());
1187 const auto window_end_x = static_cast<int>(window.x().end());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001188 const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
giuros01154bc1c2019-03-26 17:44:40 +00001189
Sheri Zhang2cb05d92020-11-09 15:12:32 +00001190 using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;
1191
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001192 if (is_broadcast_across_x)
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001193 {
1194 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
1195 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
1196 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001197 const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
1198 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
giuros01154bc1c2019-03-26 17:44:40 +00001199
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001200 // Clear X Dimension on execution window as we handle manually
1201 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
giuros01154bc1c2019-03-26 17:44:40 +00001202
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001203 Iterator broadcast_input(broadcast_tensor, broadcast_win);
1204 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001205 Iterator dst(out, win);
giuros01154bc1c2019-03-26 17:44:40 +00001206
Omar Al Khatib605a9282022-11-01 17:01:24 +00001207 execute_window_loop(
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001208 win,
1209 [&](const Coordinates &)
Sheri Zhang2cb05d92020-11-09 15:12:32 +00001210 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001211 const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
1212 const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
Sheri Zhang2cb05d92020-11-09 15:12:32 +00001213
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001214 const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
Sheri Zhang2cb05d92020-11-09 15:12:32 +00001215
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001216 // Compute window_step_x elements per iteration
1217 int x = window_start_x;
1218 for (; x <= (window_end_x - window_step_x); x += window_step_x)
1219 {
1220 const auto a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x);
1221 float32x4_t b = vdupq_n_f32(broadcast_value);
Sheri Zhang2cb05d92020-11-09 15:12:32 +00001222
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001223 const float32x4_t mask = {-1.0f, 1.0f, -1.0f, 1.0f};
1224 const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
1225 const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
1226 const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
1227 const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
Sheri Zhang2cb05d92020-11-09 15:12:32 +00001228
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001229 const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
1230 const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
Sheri Zhang2cb05d92020-11-09 15:12:32 +00001231
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001232 float32x4_t res = wrapper::vmul(tmp0, b);
1233 b = wrapper::vmul(b, mask);
1234
1235 res = wrapper::vmla(res, tmp1, b);
1236 wrapper::vstore(output_ptr + 2 * x, res);
1237 }
1238
1239 // Compute left-over elements
1240 for (; x < window_end_x; ++x)
1241 {
1242 const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x);
1243 const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1);
1244 auto res1 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1);
1245 auto res2 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0);
1246 *(output_ptr + 2 * x) = res1;
1247 *(output_ptr + 2 * x + 1) = res2;
1248 }
1249 },
1250 broadcast_input, non_broadcast_input, dst);
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001251 }
1252 else
1253 {
1254 // Clear X Dimension on execution window as we handle manually
1255 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1256 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1257
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001258 Iterator input1(src1, input1_win);
1259 Iterator input2(src2, input2_win);
1260 Iterator dst(out, win);
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001261
Omar Al Khatib605a9282022-11-01 17:01:24 +00001262 execute_window_loop(
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001263 win,
1264 [&](const Coordinates &)
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001265 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001266 const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
1267 const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
1268 const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001269
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001270 // Compute window_step_x elements per iteration
1271 int x = window_start_x;
1272 for (; x <= (window_end_x - window_step_x); x += window_step_x)
1273 {
1274 const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x);
1275 float32x4_t b = wrapper::vloadq(input2_ptr + 2 * x);
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001276
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001277 const float32x4_t mask = {-1.0f, 1.0f, -1.0f, 1.0f};
1278 const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
1279 const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
1280 const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
1281 const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001282
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001283 const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
1284 const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001285
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001286 float32x4_t res = wrapper::vmul(tmp0, b);
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001287
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001288 b = wrapper::vrev64(b);
1289 b = wrapper::vmul(b, mask);
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001290
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001291 res = wrapper::vmla(res, tmp1, b);
1292 wrapper::vstore(output_ptr + 2 * x, res);
1293 }
1294
1295 // Compute left-over elements
1296 for (; x < window_end_x; ++x)
1297 {
1298 const auto a0 = *(input1_ptr + 2 * x);
1299 const auto a1 = *(input1_ptr + 2 * x + 1);
1300 const auto b0 = *(input2_ptr + 2 * x);
1301 const auto b1 = *(input2_ptr + 2 * x + 1);
1302 auto res1 = a0 * b0 - a1 * b1;
1303 auto res2 = a0 * b1 + a1 * b0;
1304 *(output_ptr + 2 * x) = res1;
1305 *(output_ptr + 2 * x + 1) = res2;
1306 }
1307 },
1308 input1, input2, dst);
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001309 }
giuros01154bc1c2019-03-26 17:44:40 +00001310}
1311
Pablo Tellodf246182017-07-03 16:25:09 +01001312template <bool is_scale255, bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001313void mul_U8_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001314{
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001315 // Create input windows
1316 Window win = window;
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001317 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
1318 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001319
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001320 // Clear X Dimension on execution window as we handle manually
1321 win.set(Window::DimX, Window::Dimension(0, 1, 1));
1322 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1323 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001324
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001325 Iterator input1(src1, input1_win);
1326 Iterator input2(src2, input2_win);
1327 Iterator dst(out, win);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001328
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001329 const int window_step_x = 16 / sizeof(uint8_t);
1330 const auto window_start_x = static_cast<int>(window.x().start());
1331 const auto window_end_x = static_cast<int>(window.x().end());
1332
Omar Al Khatib605a9282022-11-01 17:01:24 +00001333 execute_window_loop(
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001334 win,
1335 [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001336 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001337 const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
1338 const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
1339 const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001340
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001341 // Compute window_step_x elements per iteration
1342 int x = window_start_x;
1343 for (; x <= (window_end_x - window_step_x); x += window_step_x)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001344 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001345 const uint8x16_t bv = wrapper::vloadq(input2_ptr + x);
1346 const uint8x16_t av = wrapper::vloadq(input1_ptr + x);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001347
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001348 uint16x8_t tmp_low = vmovl_u8(vget_low_u8(av));
1349 uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
1350 tmp_low = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
1351 tmp_high = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));
1352
1353 if (is_scale255)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001354 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001355 tmp_low = scale255_U16_U16(tmp_low);
1356 tmp_high = scale255_U16_U16(tmp_high);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001357 }
1358 else
1359 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001360 const int16x8_t vn = vdupq_n_s16(-n);
1361
1362 if (is_sat)
1363 {
1364 tmp_low = vqshlq_u16(tmp_low, vn);
1365 tmp_high = vqshlq_u16(tmp_high, vn);
1366 }
1367 else
1368 {
1369 tmp_low = vshlq_u16(tmp_low, vn);
1370 tmp_high = vshlq_u16(tmp_high, vn);
1371 }
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001372 }
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001373
1374 if (is_sat)
1375 {
1376 static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);
1377
1378 tmp_low = vminq_u16(tmp_low, max);
1379 tmp_high = vminq_u16(tmp_high, max);
1380 }
1381
1382 vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low));
1383 vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high));
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001384 }
1385
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001386 // Compute left-over elements
1387 for (; x < window_end_x; ++x)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001388 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001389 int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001390
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001391 if (is_scale255)
1392 {
1393 float tmp_f = static_cast<float>(tmp) * scale255_constant;
1394 tmp = static_cast<int32_t>(tmp_f + 0.5f);
1395 }
1396 else
1397 {
1398 tmp >>= n;
1399 }
1400
1401 if (is_sat)
1402 {
1403 tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp;
1404 }
1405
1406 *(output_ptr + x) = static_cast<int16_t>(tmp);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001407 }
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001408 },
1409 input1, input2, dst);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001410}
1411
1412template <bool is_scale255, bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001413void mul_S16_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001414{
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001415 // Create input windows
1416 Window win = window;
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001417 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
1418 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001419
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001420 // Clear X Dimension on execution window as we handle manually
1421 win.set(Window::DimX, Window::Dimension(0, 1, 1));
1422 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1423 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001424
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001425 Iterator input1(src1, input1_win);
1426 Iterator input2(src2, input2_win);
1427 Iterator dst(out, win);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001428
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001429 const int window_step_x = 16;
1430 const auto window_start_x = static_cast<int>(window.x().start());
1431 const auto window_end_x = static_cast<int>(window.x().end());
1432
Omar Al Khatib605a9282022-11-01 17:01:24 +00001433 execute_window_loop(
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001434 win,
1435 [&](const Coordinates &)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001436 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001437 const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
1438 const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
1439 const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
1440
1441 // Compute window_step_x elements per iteration
1442 int x = window_start_x;
1443 for (; x <= (window_end_x - window_step_x); x += window_step_x)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001444 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001445 const int16x8x2_t ta1 = {{
1446 vld1q_s16(input1_ptr + x),
1447 vld1q_s16(input1_ptr + x + 8),
1448 }};
1449 const uint8x8x2_t ta2u = {{
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001450 vld1_u8(input2_ptr + x),
1451 vld1_u8(input2_ptr + x + 8),
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001452 }};
1453 const int16x8x2_t ta2 = {
1454 {vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])), vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))}};
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001455
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001456 const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001457
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001458 vst1q_s16(output_ptr + x, result.val[0]);
1459 vst1q_s16(output_ptr + x + 8, result.val[1]);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001460 }
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001461
1462 // Compute left-over elements
1463 for (; x < window_end_x; ++x)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001464 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001465 int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
1466
1467 if (is_scale255)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001468 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001469 float tmp_f = static_cast<float>(tmp) * scale255_constant;
1470
1471 tmp = static_cast<int32_t>(tmp_f + 0.5f);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001472 }
1473 else
1474 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001475 if (tmp >= 0)
1476 {
1477 tmp >>= n;
1478 }
1479 else
1480 {
1481 uint32_t mask = (1u << n) - 1;
1482 tmp = (tmp + static_cast<int32_t>(mask)) >> n;
1483 }
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001484 }
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001485 if (is_sat)
1486 {
1487 tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
1488 }
1489 *(output_ptr + x) = static_cast<int16_t>(tmp);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001490 }
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001491 },
1492 input1, input2, dst);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001493}
1494
1495template <bool is_scale255, bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001496void mul_U8_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001497{
1498 // Simply swap the two input buffers
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001499 mul_S16_U8_S16<is_scale255, is_sat>(src2, src1, out, window, n);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001500}
1501} // namespace
1502
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001503void CpuMulKernel::configure(ITensorInfo *src1,
1504 ITensorInfo *src2,
1505 ITensorInfo *dst,
1506 float scale,
1507 ConvertPolicy overflow_policy,
1508 RoundingPolicy rounding_policy)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001509{
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +00001510 ARM_COMPUTE_UNUSED(rounding_policy);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001511 ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
Georgios Pinitasf0dea702017-07-03 18:17:28 +01001512
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001513 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001514
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001515 const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
Michalis Spyrou861f0db2018-02-26 16:47:58 +00001516
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001517 // Auto initialize dst if not initialized
1518 set_shape_if_empty(*dst, out_shape);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001519
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001520 _scale = scale;
1521 _scale_exponent = 0;
1522 _func_quantized = nullptr;
1523 _func_int = nullptr;
1524 _func_float = nullptr;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001525
1526 bool is_scale_255 = false;
1527 // Check and validate scaling factor
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001528 if (std::abs(scale - scale255_constant) < 0.00001f)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001529 {
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001530 is_scale_255 = true;
1531 }
1532 else
1533 {
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +00001534 int exponent = 0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001535
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +00001536 std::frexp(scale, &exponent);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001537
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +00001538 // Store the positive exponent. We know that we compute 1/2^n
1539 // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5
1540 _scale_exponent = std::abs(exponent - 1);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001541 }
1542
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001543 const DataType dt_input1 = src1->data_type();
1544 const DataType dt_input2 = src2->data_type();
1545 const DataType dt_output = dst->data_type();
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001546 const bool is_sat = (overflow_policy == ConvertPolicy::SATURATE);
1547
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001548 switch (dt_input1)
Manuel Bottini79fa9a22019-02-22 17:54:22 +00001549 {
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001550 case DataType::QASYMM8:
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001551 if (dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001552 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001553 if (mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
Omar Al Khatib605a9282022-11-01 17:01:24 +00001554 {
1555 _func_quantized = &mul_q8_neon_fixedpoint<uint8_t>;
1556 }
1557 else
1558 {
1559 _func_quantized = &mul_saturate_quantized_8<uint8_t>;
1560 }
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001561 }
1562 break;
1563 case DataType::QASYMM8_SIGNED:
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001564 if (dt_input2 == DataType::QASYMM8_SIGNED)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001565 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001566 if (mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
Omar Al Khatib605a9282022-11-01 17:01:24 +00001567 {
1568 _func_quantized = &mul_q8_neon_fixedpoint<int8_t>;
1569 }
1570 else
1571 {
1572 _func_quantized = &mul_saturate_quantized_8<int8_t>;
1573 }
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001574 }
1575 break;
1576 case DataType::QSYMM16:
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001577 if (dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001578 {
1579 _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16;
1580 }
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001581 else if (dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001582 {
1583 _func_int = &mul_QSYMM16_QSYMM16_S32;
1584 }
1585 break;
1586 case DataType::S16:
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001587 if (DataType::U8 == dt_input2 && DataType::S16 == dt_output)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001588 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001589 if (is_scale_255)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001590 {
1591 _func_int = is_sat ? &mul_S16_U8_S16<true, true> : &mul_S16_U8_S16<true, false>;
1592 }
1593 else
1594 {
1595 _func_int = is_sat ? &mul_S16_U8_S16<false, true> : &mul_S16_U8_S16<false, false>;
1596 }
1597 }
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001598 if (DataType::S16 == dt_input2 && DataType::S16 == dt_output)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001599 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001600 if (is_scale_255)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001601 {
1602 _func_int = is_sat ? &mul_S16_S16_S16<true, true> : &mul_S16_S16_S16<true, false>;
1603 }
1604 else
1605 {
1606 _func_int = is_sat ? &mul_S16_S16_S16<false, true> : &mul_S16_S16_S16<false, false>;
1607 }
1608 }
1609 break;
SiCong Libb88f892020-08-28 11:18:47 +01001610 case DataType::S32:
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001611 if (DataType::S32 == dt_input2 && DataType::S32 == dt_output)
SiCong Libb88f892020-08-28 11:18:47 +01001612 {
1613 _func_int = is_sat ? &mul_S32_S32_S32<true> : &mul_S32_S32_S32<false>;
1614 }
1615 break;
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001616 case DataType::U8:
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001617 if (DataType::U8 == dt_input2 && DataType::U8 == dt_output)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001618 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001619 if (is_scale_255)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001620 {
1621 _func_int = is_sat ? &mul_U8_U8_U8<true, true> : &mul_U8_U8_U8<true, false>;
1622 }
1623 else
1624 {
1625 _func_int = is_sat ? &mul_U8_U8_U8<false, true> : &mul_U8_U8_U8<false, false>;
1626 }
1627 }
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001628 else if (DataType::U8 == dt_input2 && DataType::S16 == dt_output)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001629 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001630 if (is_scale_255)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001631 {
1632 _func_int = is_sat ? &mul_U8_U8_S16<true, true> : &mul_U8_U8_S16<true, false>;
1633 }
1634 else
1635 {
1636 _func_int = is_sat ? &mul_U8_U8_S16<false, true> : &mul_U8_U8_S16<false, false>;
1637 }
1638 }
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001639 else if (DataType::S16 == dt_input2 && DataType::S16 == dt_output)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001640 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001641 if (is_scale_255)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001642 {
1643 _func_int = is_sat ? &mul_U8_S16_S16<true, true> : &mul_U8_S16_S16<true, false>;
1644 }
1645 else
1646 {
1647 _func_int = is_sat ? &mul_U8_S16_S16<false, true> : &mul_U8_S16_S16<false, false>;
1648 }
1649 }
1650 break;
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001651 case DataType::F16:
Pablo Marquez Tello568aab62023-11-20 14:20:01 +00001652 _func_float = REGISTER_FP16_NEON(cpu::mul_F16_F16_F16);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001653 break;
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001654 case DataType::F32:
Pablo Marquez Tello568aab62023-11-20 14:20:01 +00001655 _func_float = REGISTER_FP32_NEON(cpu::mul_F32_F32_F32);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001656 break;
1657 default:
1658 ARM_COMPUTE_ERROR("You called with the wrong img formats");
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001659 }
1660
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001661 // Configure kernel window
Viet-Hoa Do0d05b662022-09-09 15:39:05 +01001662 Window win;
1663 std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*src1, *src2);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001664
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001665 ICpuKernel::configure(win);
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +00001666}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001667
Fadi Arafeh73bb6b72022-10-06 16:20:14 +00001668size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
1669{
1670 ARM_COMPUTE_UNUSED(thread_count);
1671
1672#if defined(ENABLE_FP32_KERNELS)
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001673 if (this->_func_float == &mul_F32_F32_F32)
Fadi Arafeh73bb6b72022-10-06 16:20:14 +00001674 {
1675 size_t mws = ICPPKernel::default_mws;
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001676 if (platform.get_cpu_model() == CPUModel::N1)
Fadi Arafeh73bb6b72022-10-06 16:20:14 +00001677 {
1678 mws = default_mws_N1_fp32_neon;
1679 }
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001680 else if (platform.get_cpu_model() == CPUModel::V1)
Fadi Arafeh73bb6b72022-10-06 16:20:14 +00001681 {
1682 mws = default_mws_V1_fp32_neon;
1683 }
1684 else
1685 {
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001686 if (_split_dimension == Window::DimX)
fadara01e112ef12022-11-22 18:25:55 +00001687 {
1688 // Don't split the work load too small if the tensor has been reinterpreted as 1D.
1689 // This number is loosely chosen as threading overhead in each platform varies wildly.
1690 return default_mws_other_platforms_1d_tensor;
1691 }
1692 return default_mws;
Fadi Arafeh73bb6b72022-10-06 16:20:14 +00001693 }
1694
1695 // tensor is 1D or was re-interpreted as 1D
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001696 if (this->window().shape().num_dimensions() == 1)
Fadi Arafeh73bb6b72022-10-06 16:20:14 +00001697 {
1698 return mws;
1699 }
1700 else
1701 {
1702 // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one
1703 // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small
1704 // but the other sizes are large, which boosts performance.
1705 mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1)));
1706 return std::max(static_cast<size_t>(1), mws);
1707 }
1708 }
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001709#else /* ENABLE_FP32_KERNELS */
Fadi Arafeh73bb6b72022-10-06 16:20:14 +00001710 ARM_COMPUTE_UNUSED(platform);
1711#endif /* ENABLE_FP32_KERNELS */
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001712 if (_split_dimension == Window::DimX)
fadara01e112ef12022-11-22 18:25:55 +00001713 {
1714 // Don't split the work load too small if the tensor has been reinterpreted as 1D.
1715 // This number is loosely chosen as threading overhead in each platform varies wildly.
1716 return default_mws_other_platforms_1d_tensor;
1717 }
1718 return default_mws;
Fadi Arafeh73bb6b72022-10-06 16:20:14 +00001719}
1720
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001721Status CpuMulKernel::validate(const ITensorInfo *src1,
1722 const ITensorInfo *src2,
1723 const ITensorInfo *dst,
1724 float scale,
1725 ConvertPolicy overflow_policy,
1726 RoundingPolicy rounding_policy)
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +00001727{
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001728 ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
1729 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001730
Georgios Pinitas631c41a2017-12-06 11:53:03 +00001731 return Status{};
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001732}
1733
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01001734void CpuMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001735{
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001736 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001737 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001738 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001739
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001740 auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
1741 auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
1742 auto dst = tensors.get_tensor(TensorType::ACL_DST);
Michalis Spyrou6eb73452020-07-02 17:39:25 +01001743
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001744 if (_func_quantized != nullptr)
Michalis Spyrou861f0db2018-02-26 16:47:58 +00001745 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001746 (*_func_quantized)(src1, src2, dst, window, _scale);
Manuel Bottini79fa9a22019-02-22 17:54:22 +00001747 }
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001748 else if (_func_int != nullptr)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001749 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001750 (*_func_int)(src1, src2, dst, window, _scale_exponent);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001751 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001752 else
1753 {
1754 ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001755 (*_func_float)(src1, src2, dst, window, _scale);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001756 }
1757}
Viet-Hoa Dod4a9cc02022-11-08 12:01:21 +00001758
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01001759const char *CpuMulKernel::name() const
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001760{
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01001761 return "CpuMulKernel";
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001762}
Viet-Hoa Dod4a9cc02022-11-08 12:01:21 +00001763
giuros01154bc1c2019-03-26 17:44:40 +00001764namespace
1765{
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001766Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
giuros01154bc1c2019-03-26 17:44:40 +00001767{
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001768 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F32);
1769 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F32);
giuros01154bc1c2019-03-26 17:44:40 +00001770
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001771 const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
giuros01154bc1c2019-03-26 17:44:40 +00001772
1773 ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
1774
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001775 // Validate in case of configured dst
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001776 if (dst->total_size() > 0)
giuros01154bc1c2019-03-26 17:44:40 +00001777 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001778 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F32);
Felix Thomasmathibalanafd38f02023-09-27 17:46:17 +01001779 ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0),
1780 "Wrong shape for dst");
giuros01154bc1c2019-03-26 17:44:40 +00001781 }
1782
1783 return Status{};
1784}
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001785} // namespace
giuros01154bc1c2019-03-26 17:44:40 +00001786
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01001787void CpuComplexMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
giuros01154bc1c2019-03-26 17:44:40 +00001788{
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001789 ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
1790 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst));
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001791
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001792 const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
giuros01154bc1c2019-03-26 17:44:40 +00001793
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001794 // Auto initialize dst if not initialized
1795 const TensorInfo out_info(out_shape, src1->num_channels(), src1->data_type());
1796 auto_init_if_empty(*dst, out_info);
giuros01154bc1c2019-03-26 17:44:40 +00001797
giuros01154bc1c2019-03-26 17:44:40 +00001798 // Configure kernel window
SiCongLic7b1e842021-02-22 14:28:33 +00001799 Window win = calculate_max_window(out_shape);
giuros01154bc1c2019-03-26 17:44:40 +00001800
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001801 ICpuKernel::configure(win);
giuros01154bc1c2019-03-26 17:44:40 +00001802}
1803
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01001804Status CpuComplexMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
giuros01154bc1c2019-03-26 17:44:40 +00001805{
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001806 ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
1807 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst));
giuros01154bc1c2019-03-26 17:44:40 +00001808
1809 return Status{};
1810}
1811
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01001812void CpuComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
giuros01154bc1c2019-03-26 17:44:40 +00001813{
1814 ARM_COMPUTE_UNUSED(info);
1815 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001816 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
giuros01154bc1c2019-03-26 17:44:40 +00001817
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001818 auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
1819 auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
1820 auto dst = tensors.get_tensor(TensorType::ACL_DST);
Michalis Spyrou6eb73452020-07-02 17:39:25 +01001821
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001822 c_mul_F32_F32_F32_n(src1, src2, dst, window);
giuros01154bc1c2019-03-26 17:44:40 +00001823}
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001824
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01001825const char *CpuComplexMulKernel::name() const
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001826{
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01001827 return "CpuComplexMulKernel";
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001828}
1829} // namespace kernels
1830} // namespace cpu
Manuel Bottini79fa9a22019-02-22 17:54:22 +00001831} // namespace arm_compute