blob: cc7efe0a1df133463da95903120e4f702ed52bd5 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Viet-Hoa Do0d05b662022-09-09 15:39:05 +01002 * Copyright (c) 2016-2022 Arm Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Georgios Pinitas7891a732021-08-20 21:39:25 +010024#include "src/cpu/kernels/CpuMulKernel.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
Sheri Zhang1e3ab422021-03-16 17:35:08 +000026#include "arm_compute/core/ITensor.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010027#include "arm_compute/core/TensorInfo.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010028#include "src/core/CPP/Validate.h"
Georgios Pinitasddb93bb2020-10-02 16:38:59 +010029#include "src/core/NEON/NEAsymm.h"
30#include "src/core/NEON/NESymm.h"
31#include "src/core/NEON/wrapper/wrapper.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010032#include "src/core/helpers/AutoConfiguration.h"
33#include "src/core/helpers/WindowHelpers.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010034
35#include <arm_neon.h>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010036
namespace
{
// Default minimum-workload-size thresholds (in elements), presumably used to
// decide how finely to split fp32 Neon(TM) multiplication work across threads
// — TODO confirm against the get_mws() implementation later in this file.
// Values are per-platform tuning constants (Neoverse N1 / V1 / generic 1D).
// Note: `static` dropped — it is redundant inside an anonymous namespace.
constexpr size_t default_mws_N1_fp32_neon              = 22447;
constexpr size_t default_mws_V1_fp32_neon              = 38982;
constexpr size_t default_mws_other_platforms_1d_tensor = 10240;
} // namespace
Anthony Barbier6ff3b192017-09-04 18:44:23 +010043namespace arm_compute
44{
Sheri Zhang1e3ab422021-03-16 17:35:08 +000045namespace cpu
46{
47namespace kernels
48{
Anthony Barbier6ff3b192017-09-04 18:44:23 +010049namespace
50{
// Constants shared by the scale == 1/255 fast paths below.
const float scale255_constant = 1.f / 255.f;
// Vectorised copy of 1/255 for the Neon(TM) code paths.
const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
// +0.5 bias used to emulate round-half-up before vcvt truncates toward zero.
const float32x4_t positive_round_f32q = vdupq_n_f32(0.5f);
54
Sheri Zhang1e3ab422021-03-16 17:35:08 +000055inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +000056{
57 ARM_COMPUTE_UNUSED(overflow_policy);
58 ARM_COMPUTE_UNUSED(rounding_policy);
59
Sheri Zhang1e3ab422021-03-16 17:35:08 +000060 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src1);
61 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
SiCong Libb88f892020-08-28 11:18:47 +010062 DataType::F32);
Sheri Zhang1e3ab422021-03-16 17:35:08 +000063 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
SiCong Libb88f892020-08-28 11:18:47 +010064 DataType::F32);
Sheri Zhang1e3ab422021-03-16 17:35:08 +000065 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
Michele Di Giorgio9428a182020-03-30 14:10:20 +010066 DataType::S16, DataType::QSYMM16,
67 DataType::S32, DataType::F16, DataType::F32);
Sheri Zhang1e3ab422021-03-16 17:35:08 +000068 if(is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type()))
Pablo Tello52ea9c22019-12-10 11:28:53 +000069 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +000070 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
Georgios Pinitasd7d7e902019-12-18 15:40:54 +000071 ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, "ConvertPolicy cannot be WRAP if datatype is quantized");
Pablo Tello52ea9c22019-12-10 11:28:53 +000072 }
Manuel Bottini79fa9a22019-02-22 17:54:22 +000073
Sheri Zhang1e3ab422021-03-16 17:35:08 +000074 if(dst->total_size() > 0)
Manuel Bottini79fa9a22019-02-22 17:54:22 +000075 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +000076 const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
77 ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
Manuel Bottini79fa9a22019-02-22 17:54:22 +000078 ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
SiCong Libb88f892020-08-28 11:18:47 +010079 // clang-format off
80 ARM_COMPUTE_RETURN_ERROR_ON_MSG(
Sheri Zhang1e3ab422021-03-16 17:35:08 +000081 !(src1->data_type() == src2->data_type() && src2->data_type() == dst->data_type()) &&
82 !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
83 !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::S16 && dst->data_type() == DataType::S16) &&
84 !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
85 !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
86 !(src1->data_type() == DataType::QSYMM16 && src2->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32)
SiCong Libb88f892020-08-28 11:18:47 +010087 , "Invalid data type combination");
88 // clang-format on
Sheri Zhang1e3ab422021-03-16 17:35:08 +000089 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S16 && dst->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 dst");
Manuel Bottini79fa9a22019-02-22 17:54:22 +000090 }
Michalis Spyrou861f0db2018-02-26 16:47:58 +000091
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +000092 if(std::abs(scale - scale255_constant) < 0.00001f)
93 {
94 ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
Sheri Zhang1e3ab422021-03-16 17:35:08 +000095 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 && dst->data_type() == DataType::S32,
96 "Scale == 1/255 is not supported if input and dst are of data type S32");
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +000097 }
98 else
99 {
100 ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_ZERO);
101
102 int exponent = 0;
103 const float normalized_mantissa = std::frexp(scale, &exponent);
104
105 // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
106 // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= 14
107 // Moreover, it will be negative as we deal with 1/2^n
108 ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), "Scale value not supported (Should be 1/(2^n) or 1/255");
109 }
110
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000111 return Status{};
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +0000112}
113
/* Scales a given vector by 1/255.
 *
 * @note This does not work for all cases. e.g. for float of 0.49999999999999994 and large floats.
 *
 * @param in Input vector to scale.
 * @return Scaled result rounded to nearest (round half up).
 */
inline int32x4_t scale255_S32_S32(int32x4_t in)
{
    // Multiply by 1/255, then emulate round-half-up: add +0.5 and let
    // vcvt truncate toward zero on the way back to integers.
    const float32x4_t scaled = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q);
    return vcvtq_s32_f32(vaddq_f32(scaled, positive_round_f32q));
}
130
/* Scales each u16 lane of a vector by 1/255 (round half up).
 *
 * Widens each half of the input to s32, runs it through the s32 scaling
 * helper, then narrows and recombines the halves.
 */
inline uint16x8_t scale255_U16_U16(uint16x8_t in)
{
    const int32x4_t scaled_lo = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(in))));
    const int32x4_t scaled_hi = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(in))));
    return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(scaled_lo), vmovn_s32(scaled_hi)));
}
137
// Quantization shim: for T == int8_t, forward to the signed (QASYMM8_SIGNED)
// quantization helper so callers can use a single vquantize<T>() spelling.
template <typename T>
inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x16_t>::type
vquantize(float32x4x4_t vin, const UniformQuantizationInfo &qinfo)
{
    return vquantize_signed(vin, qinfo);
}
144
// Quantization shim: for T == uint8_t, forward to the plain vquantize overload.
// NOTE: this is NOT infinite recursion — T is not deducible from the arguments,
// so this template is not a viable candidate for the unqualified call below,
// which resolves to the non-template float32x4x4_t overload (presumably the one
// declared in NEAsymm.h — verify).
template <typename T>
inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x16_t>::type
vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
{
    return vquantize(val, info);
}
151
/** Element-wise multiplication of two quantized 8-bit tensors (QASYMM8 /
 * QASYMM8_SIGNED) with saturation: dequantize to float, multiply, requantize.
 *
 * The scale factor is folded into the output quantization info
 * (tmp_qua_info) so the per-element work is a single float multiply.
 *
 * @param[in]  src1   First input tensor.
 * @param[in]  src2   Second input tensor.
 * @param[out] out    Output tensor.
 * @param[in]  window Execution window.
 * @param[in]  scale  Scale factor applied to the product.
 */
template <typename T>
void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    const int  window_step_x         = 16 / sizeof(T); // Process a full 128-bit register per step.
    const auto window_start_x        = static_cast<int>(window.x().start());
    const auto window_end_x          = static_cast<int>(window.x().end());
    const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();

    const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();
    // Fold the user scale into the requantization step.
    const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };

    if(is_broadcast_across_x)
    {
        const bool                    is_broadcast_input_2 = input2_win.x().step() == 0;
        Window                        broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window                        non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor                *broadcast_tensor     = is_broadcast_input_2 ? src2 : src1;
        const ITensor                *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
        const UniformQuantizationInfo broadcast_qinfo      = broadcast_tensor->info()->quantization_info().uniform();
        const UniformQuantizationInfo non_broadcast_qinfo  = non_broadcast_tensor->info()->quantization_info().uniform();

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator dst(out, win);

        using ExactTagType = typename wrapper::traits::neon_vector<T, window_step_x>::tag_type;

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<T *>(dst.ptr());

            // A single scalar is broadcast along x; splat it into a vector once per row.
            const auto broadcast_value     = *reinterpret_cast<const T *>(broadcast_input.ptr());
            const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);

                // Dequantize inputs
                const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo);
                const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo);

                const float32x4x4_t out_f32x4x4 =
                {
                    vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                    vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                    vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                    vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
                };

                // Quantize dst
                const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
                wrapper::vstore(output_ptr + x, result);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                // Dequantize inputs
                // (renamed from `src1`, which shadowed the ITensor* parameter of the same name)
                const T     in1_value = *(non_broadcast_input_ptr + x);
                const float tmp_in1   = Qasymm8QuantizationHelper<T>::dequantize(in1_value, non_broadcast_qinfo);
                const float tmp_in2   = Qasymm8QuantizationHelper<T>::dequantize(broadcast_value, broadcast_qinfo);
                const float tmp_f     = tmp_in1 * tmp_in2;

                // Quantize dst
                const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
                *(output_ptr + x)  = tmp_qua;
            }
        },
        broadcast_input, non_broadcast_input, dst);
    }
    else
    {
        const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
        const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();

        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(src1, input1_win);
        Iterator input2(src2, input2_win);
        Iterator dst(out, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<T *>(dst.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto input1_q = wrapper::vloadq(input1_ptr + x);
                const auto input2_q = wrapper::vloadq(input2_ptr + x);

                // Dequantize inputs
                const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
                const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);

                const float32x4x4_t out_f32x4x4 =
                {
                    vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                    vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                    vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                    vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
                };

                // Quantize dst
                const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
                wrapper::vstore(output_ptr + x, result);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                // Dequantize inputs
                // (renamed from `src1`/`src2`, which shadowed the ITensor* parameters)
                const T     in1_value = *(input1_ptr + x);
                const T     in2_value = *(input2_ptr + x);
                const float tmp_in1   = Qasymm8QuantizationHelper<T>::dequantize(in1_value, input1_qua_info);
                const float tmp_in2   = Qasymm8QuantizationHelper<T>::dequantize(in2_value, input2_qua_info);
                const float tmp_f     = tmp_in1 * tmp_in2;

                // Quantize dst
                const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
                *(output_ptr + x)  = tmp_qua;
            }
        },
        input1, input2, dst);
    }
}
300
Omar Al Khatib605a9282022-11-01 17:01:24 +0000301bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, float scale)
302{
303 const auto iq0 = src0->quantization_info().uniform();
304 const auto iq1 = src1->quantization_info().uniform();
305 const auto oq = dst->quantization_info().uniform();
306
307 const auto multiplier = ((iq0.scale * iq1.scale) / oq.scale) * scale;
308
309 if(multiplier < -8191.f || multiplier > 8191.f)
310 {
311 //The multiplier cannot be stored as a 14.18 signed fixed-point number
312 return false;
313 }
314
315 const auto offset_out = float(oq.offset);
316
317 const auto max_result = multiplier * (256) * (256) + offset_out;
318
319 if(max_result > 8191.f)
320 {
321 //It might not be possible to store the result as a 14.18 signed fixed-point number.
322 return false;
323 }
324
325 return true;
326}
327
/** Element-wise multiplication of two quantized 8-bit tensors using 14.18
 * signed fixed-point arithmetic for the multiplier, output offset and
 * accumulator (no float per-element work).
 *
 * Only used once mul_q8_neon_fixedpoint_possible() has confirmed that all
 * intermediate values fit the 14.18 format.
 *
 * @param[in]  src0   First input tensor.
 * @param[in]  src1   Second input tensor.
 * @param[out] dst    Output tensor.
 * @param[in]  window Execution window.
 * @param[in]  scale  Scale factor applied to the product.
 */
template <typename ScalarType>
void mul_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const Window &window, float scale)
{
    const auto in0_info = src0->info();
    const auto in1_info = src1->info();

    const auto &in0_shape = in0_info->tensor_shape();
    const auto &in1_shape = in1_info->tensor_shape();

    // Create input windows.
    Window in0_win = window.broadcast_if_dimension_le_one(in0_shape);
    Window in1_win = window.broadcast_if_dimension_le_one(in1_shape);

    // Clear the x dimension on the execution window as we process the whole row each iteration.
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    constexpr int window_step_x         = 16;
    const auto    window_start_x        = window.x().start();
    const auto    window_end_x          = window.x().end();
    const auto    is_broadcast_across_x = in0_shape.x() != in1_shape.x();

    const auto iq0_info = in0_info->quantization_info().uniform();
    const auto iq1_info = in1_info->quantization_info().uniform();
    const auto oq_info  = dst->info()->quantization_info().uniform();

    const auto in0_offset = iq0_info.offset;
    const auto in1_offset = iq1_info.offset;
    const auto out_offset = oq_info.offset;
    const auto multiplier = ((iq0_info.scale * iq1_info.scale) / oq_info.scale) * scale;

    // 2^18 converts the float multiplier/offset into 14.18 fixed point.
    constexpr int32_t two_pwr18i = 262144;
    constexpr float   two_pwr18f = 262144.f;

    const auto in0_offset_16p0  = static_cast<int16_t>(in0_offset);
    const auto in1_offset_16p0  = static_cast<int16_t>(in1_offset);
    const auto out_offset_14p18 = static_cast<int32_t>(out_offset * two_pwr18i);
    const auto multiplier_14p18 = static_cast<int32_t>(multiplier * two_pwr18f);

    if(is_broadcast_across_x)
    {
        // Prefix: a = non-broadcast, b = broadcast.

        const auto is_broadcast_input_1 = in1_win.x().step() == 0;
        auto       a_win                = is_broadcast_input_1 ? in0_win : in1_win;
        auto       b_win                = is_broadcast_input_1 ? in1_win : in0_win;
        const auto a_tensor             = is_broadcast_input_1 ? src0 : src1;
        const auto b_tensor             = is_broadcast_input_1 ? src1 : src0;

        const auto a_offset_16p0 = is_broadcast_input_1 ? in0_offset_16p0 : in1_offset_16p0;
        const auto b_offset_16p0 = is_broadcast_input_1 ? in1_offset : in0_offset;
#ifndef __aarch64__
        const auto a_offset = is_broadcast_input_1 ? in0_offset : in1_offset;
        const auto b_offset = is_broadcast_input_1 ? in1_offset : in0_offset;
#endif //__aarch64__
        const auto a_voffset_16p0 = wrapper::vdup_n(a_offset_16p0, wrapper::traits::vector_64_tag());

        // Clear the x dimension on the execution window as we process the whole row each iteration.
        a_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator a_input_it(a_tensor, a_win);
        Iterator b_input_it(b_tensor, b_win);
        Iterator out_it(dst, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto a_ptr   = reinterpret_cast<const ScalarType *>(a_input_it.ptr());
            const auto b_ptr   = reinterpret_cast<const ScalarType *>(b_input_it.ptr());
            const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());

            // The broadcast scalar is offset-corrected once per row.
            const auto b_val            = *b_ptr;
            const auto b_offseted_32p0  = static_cast<int32_t>(b_val - b_offset_16p0);
            const auto b_voffseted_32p0 = wrapper::vdup_n(b_offseted_32p0, wrapper::traits::vector_128_tag());

            const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag());
            const auto voffsetout_14p18  = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag());

            int x = window_start_x;

            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                // Load the inputs.
                const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x);

                // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness.
                const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0)));
                const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0)));

                const auto voffseted_32p0_00 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_0), a_voffset_16p0);
                const auto voffseted_32p0_01 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_0), a_voffset_16p0);
                const auto voffseted_32p0_10 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_1), a_voffset_16p0);
                const auto voffseted_32p0_11 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_1), a_voffset_16p0);

                const auto vinnermul_32p0_00 = wrapper::vmul(voffseted_32p0_00, b_voffseted_32p0);
                const auto vinnermul_32p0_01 = wrapper::vmul(voffseted_32p0_01, b_voffseted_32p0);
                const auto vinnermul_32p0_10 = wrapper::vmul(voffseted_32p0_10, b_voffseted_32p0);
                const auto vinnermul_32p0_11 = wrapper::vmul(voffseted_32p0_11, b_voffseted_32p0);

                const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18);
                const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18);
                const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18);
                const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18);

                // These shift rights are to revert the multiplication by twopwr18. Hard limit of a maximum shift by 8 requires multiple shift instructions to achieve this.
                const auto vout_15p1_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00));
                const auto vout_15p1_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01));
                const auto vout_15p1_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10));
                const auto vout_15p1_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11));

                const auto vout_15p1_0 = wrapper::vcombine(
                                             vout_15p1_00,
                                             vout_15p1_01);

                const auto vout_15p1_1 = wrapper::vcombine(
                                             vout_15p1_10,
                                             vout_15p1_11);

                // Note: a redundant re-declaration of out_ptr (shadowing the lambda-scope one above
                // with the identical value) was removed here.

                const auto vout_8p0 = wrapper::vcombine(
                                          wrapper::vqrshrn<2>(vout_15p1_0),
                                          wrapper::vqrshrn<2>(vout_15p1_1));
                wrapper::vstore(out_ptr + x, vout_8p0);
            }

            //Process the left-over elements.
            for(; x < window_end_x; ++x)
            {
#ifdef __aarch64__
                out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>((multiplier_14p18 * (int32_t(a_ptr[x]) - a_offset_16p0) * (int32_t(
                                                                                                            b_val) - b_offset_16p0)) + out_offset_14p18)));
#else  //__aarch64__
                out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround(multiplier * ((float(a_ptr[x]) - a_offset) * (float(b_val) - b_offset)) + float(out_offset)));
#endif //__aarch64__
            }
        },
        a_input_it, b_input_it, out_it);
    }
    else
    {
        const auto voffset0_16p0     = wrapper::vdup_n(in0_offset_16p0, wrapper::traits::vector_64_tag());
        const auto voffset1_16p0     = wrapper::vdup_n(in1_offset_16p0, wrapper::traits::vector_64_tag());
        const auto voffsetout_14p18  = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag());
        const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag());

        // Clear the x dimension on the execution window as we process the whole row each iteration.
        in0_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        in1_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator in0_it(src0, in0_win);
        Iterator in1_it(src1, in1_win);
        Iterator out_it(dst, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr());
            const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr());
            const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());

            int x = window_start_x;

            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                // Load the inputs.
                const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x);
                const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x);

                // Widen the input elements to signed 16-bit regardless of the input signedness.
                const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0)));
                const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0)));
                const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0)));
                const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0)));

                const auto voffseted0_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_0), voffset0_16p0);
                const auto voffseted0_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_0), voffset0_16p0);
                const auto voffseted0_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_1), voffset0_16p0);
                const auto voffseted0_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_1), voffset0_16p0);

                const auto voffseted1_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_0), voffset1_16p0);
                const auto voffseted1_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_0), voffset1_16p0);
                const auto voffseted1_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_1), voffset1_16p0);
                const auto voffseted1_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_1), voffset1_16p0);

                const auto vinnermul_32p0_00 = wrapper::vmul(voffseted0_32p0_00, voffseted1_32p0_00);
                const auto vinnermul_32p0_01 = wrapper::vmul(voffseted0_32p0_01, voffseted1_32p0_01);
                const auto vinnermul_32p0_10 = wrapper::vmul(voffseted0_32p0_10, voffseted1_32p0_10);
                const auto vinnermul_32p0_11 = wrapper::vmul(voffseted0_32p0_11, voffseted1_32p0_11);

                const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18);
                const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18);
                const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18);
                const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18);

                // These shift rights are to revert the multiplication by twopwr18. Hard limit of a maximum shift by 8 requires multiple shift instructions to achieve this.
                const auto vout_14p2_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00));
                const auto vout_14p2_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01));
                const auto vout_14p2_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10));
                const auto vout_14p2_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11));

                const auto vout_14p2_0 = wrapper::vcombine(
                                             vout_14p2_00,
                                             vout_14p2_01);

                const auto vout_14p2_1 = wrapper::vcombine(
                                             vout_14p2_10,
                                             vout_14p2_11);

                const auto vout_8p0 = wrapper::vcombine(
                                          wrapper::vqrshrn<2>(vout_14p2_0),
                                          wrapper::vqrshrn<2>(vout_14p2_1));
                wrapper::vstore(out_ptr + x, vout_8p0);
            }

            //Process the left-over elements.
            for(; x < window_end_x; ++x)
            {
#ifdef __aarch64__
                out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>((multiplier_14p18 * (int32_t(in0_ptr[x]) - in0_offset_16p0) * (int32_t(
                                                                                                            in1_ptr[x]) - in1_offset_16p0)) + out_offset_14p18)));
#else  //__aarch64__
                out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround(multiplier * ((float(in0_ptr[x]) - in0_offset) * (float(in1_ptr[x]) - in1_offset)) + float(out_offset)));
#endif //__aarch64__
            }
        },
        in0_it, in1_it, out_it);
    }
}
556
// Element-wise multiplication of two QSYMM16 tensors into a QSYMM16 output.
// Both inputs are dequantized to float, multiplied, and requantized using the
// output quantization info with the extra user-provided scale folded in.
//
// @param src1   First input tensor (QSYMM16).
// @param src2   Second input tensor (QSYMM16). May be broadcast along dims <= 1.
// @param out    Output tensor (QSYMM16).
// @param window Execution window (X dimension is traversed manually).
// @param scale  Extra multiplicative scale applied to the product.
void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
{
    const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
    const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();
    const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();

    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    // Fold the requested scale into the output quantization info so the whole
    // rescaling is performed by a single division at quantization time.
    const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };

    execute_window_loop(
        win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<qsymm16_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const qsymm16x8x2_t input1_q =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const qsymm16x8x2_t input2_q =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };

            // Dequantize inputs
            const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
            const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);

            const float32x4x4_t out_f32x4x4 =
            {
                vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
            };

            // Requantize with the adjusted (scale-folded) quantization info.
            const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info);
            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            // Dequantize inputs
            float tmp_in1 = static_cast<float>(*(input1_ptr + x)) * input1_qua_info.scale;
            float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale;
            float tmp_f   = tmp_in1 * tmp_in2;

            // Quantize dst, lrintf() has same rounding mode as vcombine_s16
            int32_t tmp = lrintf(tmp_f / tmp_qua_info.scale);
            // Clamp to the QSYMM16 range *before* narrowing. Fix: the cast must cover
            // the whole conditional expression; previously only the boolean comparison
            // was cast, which still selected the right value but made the result rely
            // on an implicit narrowing conversion from int to qsymm16_t.
            const qsymm16_t tmp_qua = static_cast<qsymm16_t>((tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp));
            *(output_ptr + x) = tmp_qua;
        }
    },
    input1, input2, dst);
}
642
// Element-wise multiplication of two QSYMM16 tensors producing a raw S32 product.
// No requantization or rescaling is applied: each output element is the product of
// the two quantized s16 values widened to 32 bits, so the `scale` parameter is unused.
void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int scale)
{
    ARM_COMPUTE_UNUSED(scale);

    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(
        win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            // Load 16 s16 elements from each input as two 8-lane registers.
            const qsymm16x8x2_t input1_q =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const qsymm16x8x2_t input2_q =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };

            // Widen each s16 half to s32 so the products cannot overflow.
            const int32x4x2_t in1_s32 =
            {
                {
                    vmovl_s16(vget_low_s16(input1_q.val[0])),
                    vmovl_s16(vget_high_s16(input1_q.val[0])),
                    vmovl_s16(vget_low_s16(input1_q.val[1])),
                    vmovl_s16(vget_high_s16(input1_q.val[1])),
                }
            };
            const int32x4x2_t in2_s32 =
            {
                {
                    vmovl_s16(vget_low_s16(input2_q.val[0])),
                    vmovl_s16(vget_high_s16(input2_q.val[0])),
                    vmovl_s16(vget_low_s16(input2_q.val[1])),
                    vmovl_s16(vget_high_s16(input2_q.val[1])),
                }
            };

            // Raw s32 x s32 products (no scaling applied).
            const int32x4x4_t result =
            {
                {
                    vmulq_s32(in1_s32.val[0], in2_s32.val[0]),
                    vmulq_s32(in1_s32.val[1], in2_s32.val[1]),
                    vmulq_s32(in1_s32.val[2], in2_s32.val[2]),
                    vmulq_s32(in1_s32.val[3], in2_s32.val[3]),
                }
            };

            vst1q_s32(output_ptr + x, result.val[0]);
            vst1q_s32(output_ptr + x + 4, result.val[1]);
            vst1q_s32(output_ptr + x + 8, result.val[2]);
            vst1q_s32(output_ptr + x + 12, result.val[3]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            // Scalar tail: same widen-then-multiply as the vector path.
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
            *(output_ptr + x) = tmp;
        }
    },
    input1, input2, dst);
}
735
// Element-wise multiplication of two U8 tensors into a U8 output.
// Each product is computed in u16 and then rescaled either by 1/255
// (is_scale255) or by a right shift of n bits; is_sat selects saturating
// shift/narrowing back to u8 instead of wrapping/truncation.
template <bool is_scale255, bool is_sat>
void mul_U8_U8_U8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16 / sizeof(uint8_t);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(
        win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<uint8_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x);
            const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x);

            // Widen each 8-lane half to u16 so the products cannot overflow.
            uint16x8_t       tmp1_high = vmovl_u8(vget_high_u8(ta1));
            const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
            uint16x8_t       tmp1_low  = vmovl_u8(vget_low_u8(ta1));
            const uint16x8_t tmp2_low  = vmovl_u8(vget_low_u8(ta2));

            tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
            tmp1_low  = vmulq_u16(tmp1_low, tmp2_low);

            if(is_scale255)
            {
                tmp1_high = scale255_U16_U16(tmp1_high);
                tmp1_low  = scale255_U16_U16(tmp1_low);
            }
            else
            {
                // Right shift by n is expressed as a (saturating) left shift by -n.
                const int16x8_t vn = vdupq_n_s16(-n);

                if(is_sat)
                {
                    tmp1_high = vqshlq_u16(tmp1_high, vn);
                    tmp1_low  = vqshlq_u16(tmp1_low, vn);
                }
                else
                {
                    tmp1_high = vshlq_u16(tmp1_high, vn);
                    tmp1_low  = vshlq_u16(tmp1_low, vn);
                }
            }
            if(is_sat)
            {
                // Saturating narrow u16 -> u8.
                vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
            }
            else
            {
                // Truncating narrow u16 -> u8.
                vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
            }
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            uint16_t tmp = static_cast<uint16_t>(*(input1_ptr + x)) * static_cast<uint16_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                // Multiply by 1/255 with round-to-nearest to match scale255_U16_U16.
                float tmp_f = static_cast<float>(tmp) * scale255_constant;
                tmp         = static_cast<uint16_t>(tmp_f + 0.5f);
            }
            else
            {
                tmp >>= n;
            }
            if(is_sat && tmp > 255)
            {
                tmp = 255;
            }
            *(output_ptr + x) = static_cast<uint8_t>(tmp);
        }
    },
    input1, input2, dst);
}
832
// Multiplies two s16x8 vectors with products computed in s32, rescales by 1/255
// (is_scale255) or by an arithmetic right shift of n bits with round-toward-zero,
// and narrows the result back to s16. is_sat selects saturating shift/narrowing
// instead of wrapping/truncating variants.
template <bool is_scale255, bool is_sat>
inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t &src2, int n)
{
    // Widen each half to s32 so the products cannot overflow.
    int32x4_t       tmp1_high = vmovl_s16(vget_high_s16(src1));
    const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(src2));
    int32x4_t       tmp1_low  = vmovl_s16(vget_low_s16(src1));
    const int32x4_t tmp2_low  = vmovl_s16(vget_low_s16(src2));

    tmp1_high = vmulq_s32(tmp1_high, tmp2_high);
    tmp1_low  = vmulq_s32(tmp1_low, tmp2_low);

    if(is_scale255)
    {
        tmp1_high = scale255_S32_S32(tmp1_high);
        tmp1_low  = scale255_S32_S32(tmp1_low);
    }
    else
    {
        // An arithmetic right shift rounds toward -inf. To round toward zero,
        // negative lanes get (2^n - 1) added before the shift so the truncation
        // goes toward zero instead.
        // Right shift amount
        const int32x4_t vn = vdupq_n_s32(-n);
        // Left shift amount
        const int32x4_t vnl = vdupq_n_s32(n);
        // Calculate conversion bit: sign * (2^n - 1), i.e. (2^n - 1) for negative
        // lanes and 0 for non-negative ones.
        const uint32x4_t tmp1_high_u  = vreinterpretq_u32_s32(tmp1_high);
        const uint32x4_t tmp1_low_u   = vreinterpretq_u32_s32(tmp1_low);
        const uint32x4_t sign_high    = vshrq_n_u32(tmp1_high_u, 31);
        const uint32x4_t sign_low     = vshrq_n_u32(tmp1_low_u, 31);
        const int32x4_t  sign_high_s  = vreinterpretq_s32_u32(sign_high);
        const int32x4_t  sign_low_s   = vreinterpretq_s32_u32(sign_low);
        const int32x4_t  convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s);
        const int32x4_t  convert_low  = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s);
        if(is_sat)
        {
            tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
            tmp1_low  = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
        }
        else
        {
            tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
            tmp1_low  = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
        }
    }

    if(is_sat)
    {
        // Saturating narrow s32 -> s16.
        return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high));
    }
    else
    {
        // Truncating narrow s32 -> s16.
        return vcombine_s16(vmovn_s32(tmp1_low), vmovn_s32(tmp1_high));
    }
}
885
// Applies the per-register s16 multiply/scale helper to each half of a
// 16-element register pair. Rescaling/saturation semantics are delegated
// entirely to mul_S16_S16_S16_n_loop.
template <bool is_scale255, bool is_sat>
inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &src1, const int16x8x2_t &src2, int n)
{
    int16x8x2_t out;
    // Elements 0-7
    out.val[0] = mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[0], src2.val[0], n);
    // Elements 8-15
    out.val[1] = mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[1], src2.val[1], n);
    return out;
}
901
// Element-wise multiplication of two S16 tensors into an S16 output.
// Products are computed in s32, rescaled by 1/255 (is_scale255) or shifted right
// by n bits with round-toward-zero, then narrowed back to s16 (saturating when
// is_sat is set, truncating otherwise).
template <bool is_scale255, bool is_sat>
void mul_S16_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(
        win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            // Load 16 elements from each input as two 8-lane registers.
            const int16x8x2_t ta1 =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const int16x8x2_t ta2 =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };
            const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);

            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                // Multiply by 1/255 with round-to-nearest, matching the vector path.
                float tmp_f = static_cast<float>(tmp) * scale255_constant;

                tmp = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
                    // Round toward zero: add (2^n - 1) before the arithmetic shift
                    // so negative values truncate toward zero instead of -inf.
                    uint32_t mask = (1u << n) - 1;
                    tmp           = (tmp + static_cast<int32_t>(mask)) >> n;
                }
            }
            if(is_sat)
            {
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
            }
            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, dst);
}
986
// Multiplies two s32x4 vectors with products computed in s64, scales them down by
// an arithmetic right shift of n bits with round-toward-zero, and narrows back to
// s32. is_sat selects saturating shift/narrowing instead of wrapping/truncating.
template <bool is_sat>
inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t &src2, int n)
{
    const int32x2_t input1_1 = vget_low_s32(src1);
    const int32x2_t input2_1 = vget_low_s32(src2);
    const int32x2_t input1_2 = vget_high_s32(src1);
    const int32x2_t input2_2 = vget_high_s32(src2);

    // Widening multiplies: s32 x s32 -> s64, so products cannot overflow.
    int64x2_t tmp_1 = vmull_s32(input1_1, input2_1);
    int64x2_t tmp_2 = vmull_s32(input1_2, input2_2);

    // Apply scaling, conversion and rounding (round to zero)
    // Right shift amount
    const int64x2_t vn = vdupq_n_s64(-n);
    // Left shift amount
    const int64x2_t vnl = vdupq_n_s64(n);
    // Calculate conversion bit: sign * (2^n - 1), added before the arithmetic shift
    // so negative lanes truncate toward zero instead of -inf.
    const uint64x2_t tmp_1_u   = vreinterpretq_u64_s64(tmp_1);
    const uint64x2_t sign_1    = vshrq_n_u64(tmp_1_u, 63);
    const int64x2_t  sign_1_s  = vreinterpretq_s64_u64(sign_1);
    const int64x2_t  convert_1 = vsubq_s64(vshlq_s64(sign_1_s, vnl), sign_1_s);

    const uint64x2_t tmp_2_u   = vreinterpretq_u64_s64(tmp_2);
    const uint64x2_t sign_2    = vshrq_n_u64(tmp_2_u, 63);
    const int64x2_t  sign_2_s  = vreinterpretq_s64_u64(sign_2);
    const int64x2_t  convert_2 = vsubq_s64(vshlq_s64(sign_2_s, vnl), sign_2_s);
    if(is_sat)
    {
        // Saturating shift then saturating narrow s64 -> s32.
        tmp_1 = vqshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
        tmp_2 = vqshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
        return vcombine_s32(vqmovn_s64(tmp_1), vqmovn_s64(tmp_2));
    }
    else
    {
        // Plain shift then truncating narrow s64 -> s32.
        tmp_1 = vshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
        tmp_2 = vshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
        return vcombine_s32(vmovn_s64(tmp_1), vmovn_s64(tmp_2));
    }
}
1026
// Applies the per-register s32 multiply/scale helper to each half of an
// 8-element register pair. Shift/saturation semantics are delegated entirely
// to mul_S32_S32_S32_n_loop.
template <bool is_sat>
inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &src1, const int32x4x2_t &src2, int n)
{
    int32x4x2_t out;
    // First 4 elements
    out.val[0] = mul_S32_S32_S32_n_loop<is_sat>(src1.val[0], src2.val[0], n);
    // Second 4 elements
    out.val[1] = mul_S32_S32_S32_n_loop<is_sat>(src1.val[1], src2.val[1], n);
    return out;
}
1042
// Element-wise multiplication of two S32 tensors into an S32 output.
// Products are computed in s64, shifted right by n bits with round-toward-zero and
// narrowed back to s32 (clamped to the s32 range when is_sat). One input may be
// broadcast along the X dimension; the broadcast case takes a dedicated path.
template <bool is_sat>
void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    const int  window_step_x         = 8;
    const auto window_start_x        = static_cast<int>(window.x().start());
    const auto window_end_x          = static_cast<int>(window.x().end());
    const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();

    if(is_broadcast_across_x)
    {
        // A window step of 0 on X identifies which input is the broadcast one.
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? src2 : src1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator dst(out, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const int32_t *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<int32_t *>(dst.ptr());

            // The broadcast value is constant along X: load once, splat to all lanes.
            const int32_t broadcast_value     = *reinterpret_cast<const int32_t *>(broadcast_input.ptr());
            const auto    broadcast_value_vec = vdupq_n_s32(broadcast_value);

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const int32x4x2_t broadcast_v =
                {
                    {
                        broadcast_value_vec,
                        broadcast_value_vec,
                    }
                };
                const int32x4x2_t non_broadcast_v =
                {
                    {
                        vld1q_s32(non_broadcast_input_ptr + x),
                        vld1q_s32(non_broadcast_input_ptr + x + 4),
                    }
                };
                const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(broadcast_v, non_broadcast_v, n);

                vst1q_s32(output_ptr + x, result.val[0]);
                vst1q_s32(output_ptr + x + 4, result.val[1]);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                int64_t tmp = static_cast<int64_t>(broadcast_value) * static_cast<int64_t>(*(non_broadcast_input_ptr + x));

                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
                    // Round toward zero: add (2^n - 1) before the arithmetic shift
                    // so negative values truncate toward zero instead of -inf.
                    uint64_t mask = ((uint64_t)1u << n) - 1;
                    tmp           = (tmp + static_cast<int64_t>(mask)) >> n;
                }
                if(is_sat)
                {
                    tmp = utility::clamp<int64_t, int32_t>(tmp);
                }
                *(output_ptr + x) = static_cast<int32_t>(tmp);
            }
        },
        broadcast_input, non_broadcast_input, dst);
    }
    else
    {
        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(src1, input1_win);
        Iterator input2(src2, input2_win);
        Iterator dst(out, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const int32_t *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const int32_t *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const int32x4x2_t ta1 =
                {
                    {
                        vld1q_s32(input1_ptr + x),
                        vld1q_s32(input1_ptr + x + 4),
                    }
                };
                const int32x4x2_t ta2 =
                {
                    {
                        vld1q_s32(input2_ptr + x),
                        vld1q_s32(input2_ptr + x + 4),
                    }
                };
                const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(ta1, ta2, n);

                vst1q_s32(output_ptr + x, result.val[0]);
                vst1q_s32(output_ptr + x + 4, result.val[1]);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                int64_t tmp = static_cast<int64_t>(*(input1_ptr + x)) * static_cast<int64_t>(*(input2_ptr + x));

                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
                    // Round toward zero: add (2^n - 1) before the arithmetic shift
                    // so negative values truncate toward zero instead of -inf.
                    uint64_t mask = ((uint64_t)1u << n) - 1;
                    tmp           = (tmp + static_cast<int64_t>(mask)) >> n;
                }
                if(is_sat)
                {
                    tmp = utility::clamp<int64_t, int32_t>(tmp);
                }
                *(output_ptr + x) = static_cast<int32_t>(tmp);
            }
        },
        input1, input2, dst);
    }
}
1195
// Element-wise multiplication of two F32 tensors with an extra scalar scale:
// dst = src1 * src2 * scale. One input may be broadcast along the X dimension.
//
// @param src1   First input tensor (F32).
// @param src2   Second input tensor (F32).
// @param out    Output tensor (F32).
// @param window Execution window (X dimension is traversed manually).
// @param scale  Scalar multiplier applied to every product.
void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    constexpr int window_step_x         = 16 / sizeof(float);
    const auto    window_start_x        = static_cast<int>(window.x().start());
    const auto    window_end_x          = static_cast<int>(window.x().end());
    const bool    is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();

    using ExactTagType = typename wrapper::traits::neon_vector<float, window_step_x>::tag_type;

    if(is_broadcast_across_x)
    {
        // A window step of 0 on X identifies which input is the broadcast one.
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? src2 : src1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator dst(out, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<float *>(dst.ptr());

            // The broadcast value is constant along X: load once, splat to all lanes.
            const float broadcast_value     = *reinterpret_cast<const float *>(broadcast_input.ptr());
            const auto  broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
            const auto  scale_vec           = wrapper::vdup_n(scale, ExactTagType{});

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
                auto       res             = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
                *(output_ptr + x)          = broadcast_value * non_broadcast_v * scale;
            }
        },
        broadcast_input, non_broadcast_input, dst);
    }
    else
    {
        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(src1, input1_win);
        Iterator input2(src2, input2_win);
        Iterator dst(out, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<float *>(dst.ptr());

            // Hoist the loop-invariant scale vector out of the inner loop
            // (previously re-created every iteration; this also matches the
            // broadcast path above).
            const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto ta1 = wrapper::vloadq(input1_ptr + x);
                const auto ta2 = wrapper::vloadq(input2_ptr + x);
                const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto ta1    = *(input1_ptr + x);
                const auto ta2    = *(input2_ptr + x);
                *(output_ptr + x) = ta1 * ta2 * scale;
            }
        },
        input1, input2, dst);
    }
}
1295
// Element-wise multiplication of two complex float32 tensors.
// Each logical element is an interleaved {real, imaginary} pair of floats, so the
// scalar tail below computes the complex product (a0 + a1*i) * (b0 + b1*i) =
// (a0*b0 - a1*b1) + (a0*b1 + a1*b0)*i.
// The vector loop handles window_step_x == 2 complex values (4 floats) per iteration.
// Broadcasting of one input along X is supported (only one value is read per row).
void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window)
{
    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    // 8 / sizeof(float) == 2 complex elements (i.e. one 4-float NEON register) per step
    constexpr int window_step_x         = 8 / sizeof(float);
    const auto    window_start_x        = static_cast<int>(window.x().start());
    const auto    window_end_x          = static_cast<int>(window.x().end());
    const bool    is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();

    using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;

    if(is_broadcast_across_x)
    {
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? src2 : src1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator dst(out, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<float *>(dst.ptr());

            // NOTE(review): only ONE float is read from the broadcast tensor, and the scalar
            // tail uses it for both the real and imaginary components of the broadcast operand
            // (i.e. the broadcast value is treated as b + b*i) — confirm this matches the
            // intended complex-broadcast semantics of the caller.
            const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                // a = {r0, i0, r1, i1}; b has the broadcast value in every lane.
                const auto  a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x);
                float32x4_t b = vdupq_n_f32(broadcast_value);

                // Build tmp0 = {r0, r0, r1, r1} and tmp1 = {i0, i0, i1, i1} by duplicating
                // each scalar lane into a 2-lane vector and recombining.
                const float32x4_t mask  = { -1.0f, 1.0f, -1.0f, 1.0f };
                const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
                const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
                const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
                const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});

                const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
                const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);

                // res = {r*b, r*b, ...}; then negate b in the even (real-output) lanes so the
                // fused multiply-accumulate yields {r*b - i*b, r*b + i*b, ...}.
                // (No vrev64 is needed here: all lanes of b hold the same broadcast value.)
                float32x4_t res = wrapper::vmul(tmp0, b);
                b               = wrapper::vmul(b, mask);

                res = wrapper::vmla(res, tmp1, b);
                wrapper::vstore(output_ptr + 2 * x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x);
                const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1);
                // (a0 + a1*i) * (b + b*i): real = b*(a0 - a1), imag = b*(a1 + a0)
                auto       res1                 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1);
                auto       res2                 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0);
                *(output_ptr + 2 * x)     = res1;
                *(output_ptr + 2 * x + 1) = res2;
            }
        },
        broadcast_input, non_broadcast_input, dst);
    }
    else
    {
        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(src1, input1_win);
        Iterator input2(src2, input2_win);
        Iterator dst(out, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<float *>(dst.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                // a = {r0, i0, r1, i1}, b = {br0, bi0, br1, bi1}
                const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x);
                float32x4_t       b = wrapper::vloadq(input2_ptr + 2 * x);

                // tmp0 = {r0, r0, r1, r1}, tmp1 = {i0, i0, i1, i1}
                const float32x4_t mask  = { -1.0f, 1.0f, -1.0f, 1.0f };
                const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
                const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
                const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
                const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});

                const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
                const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);

                // res = {r*br, r*bi, ...}
                float32x4_t res = wrapper::vmul(tmp0, b);

                // rev64 swaps the re/im lanes within each pair, then the mask negates the
                // lane that lands in the real output: b becomes {-bi0, br0, -bi1, br1}.
                b = wrapper::vrev64(b);
                b = wrapper::vmul(b, mask);

                // res = {r*br - i*bi, r*bi + i*br, ...} — the complex product.
                res = wrapper::vmla(res, tmp1, b);
                wrapper::vstore(output_ptr + 2 * x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto a0 = *(input1_ptr + 2 * x);
                const auto a1 = *(input1_ptr + 2 * x + 1);
                const auto b0 = *(input2_ptr + 2 * x);
                const auto b1 = *(input2_ptr + 2 * x + 1);
                // Scalar complex multiply: real = a0*b0 - a1*b1, imag = a0*b1 + a1*b0
                auto       res1 = a0 * b0 - a1 * b1;
                auto       res2 = a0 * b1 + a1 * b0;
                *(output_ptr + 2 * x)     = res1;
                *(output_ptr + 2 * x + 1) = res2;
            }
        },
        input1, input2, dst);
    }
}
1430
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001431#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// Element-wise multiplication of two float16 tensors with a float scale:
// out = src1 * src2 * scale.
// Processes 16 fp16 values per vector iteration as two 8-lane halves (float16x8x2_t)
// and supports broadcasting of one input along the X dimension.
// Only compiled when fp16 vector arithmetic is available (see surrounding #ifdef).
void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    constexpr int window_step_x         = 16;
    const auto    window_start_x        = static_cast<int>(window.x().start());
    const auto    window_end_x          = static_cast<int>(window.x().end());
    const bool    is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
    if(is_broadcast_across_x)
    {
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? src2 : src1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator dst(out, win);
        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const float16_t *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<float16_t *>(dst.ptr());
            // One scalar per row of the broadcast input, splatted into both vector halves
            const auto broadcast_value         = *reinterpret_cast<const float16_t *>(broadcast_input.ptr());
            const float16x8x2_t broadcast_value_vec =
            {
                {
                    vdupq_n_f16(broadcast_value),
                    vdupq_n_f16(broadcast_value),
                }
            };
            const auto scale_vec = vdupq_n_f16(scale);
            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const float16x8x2_t non_broadcast_v =
                {
                    {
                        vld1q_f16(non_broadcast_input_ptr + x),
                        vld1q_f16(non_broadcast_input_ptr + x + 8),
                    }
                };
                // result = broadcast * non_broadcast * scale, per 8-lane half
                const float16x8x2_t result =
                {
                    {
                        vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec),
                        vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec),
                    }
                };
                vst1q_f16(output_ptr + x, result.val[0]);
                vst1q_f16(output_ptr + x + 8, result.val[1]);
            }
            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
                *(output_ptr + x)          = broadcast_value * non_broadcast_v * scale;
            }
        },
        broadcast_input, non_broadcast_input, dst);
    }
    else
    {
        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        Iterator input1(src1, input1_win);
        Iterator input2(src2, input2_win);
        Iterator dst(out, win);
        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const float16_t *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const float16_t *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());
            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const float16x8x2_t ta1 =
                {
                    {
                        vld1q_f16(input1_ptr + x),
                        vld1q_f16(input1_ptr + x + 8),
                    }
                };
                const float16x8x2_t ta2 =
                {
                    {
                        vld1q_f16(input2_ptr + x),
                        vld1q_f16(input2_ptr + x + 8),
                    }
                };
                const float16x8_t scale_vec = vdupq_n_f16(scale);
                // result = ta1 * ta2 * scale, per 8-lane half
                const float16x8x2_t result =
                {
                    {
                        vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec),
                        vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec),
                    }
                };
                vst1q_f16(output_ptr + x, result.val[0]);
                vst1q_f16(output_ptr + x + 8, result.val[1]);
            }
            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto ta1    = *(input1_ptr + x);
                const auto ta2    = *(input2_ptr + x);
                *(output_ptr + x) = ta1 * ta2 * scale;
            }
        },
        input1, input2, dst);
    }
}
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001554#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tellodf246182017-07-03 16:25:09 +01001555
// Element-wise multiplication U8 * U8 -> S16.
// Inputs are widened to 16 bits before multiplying (255 * 255 = 65025 fits in uint16_t).
// Template parameters:
//   is_scale255 - when true, the product is scaled by ~1/255 with rounding via
//                 scale255_U16_U16 (helper defined earlier in this file);
//                 when false, the product is shifted right by n bits instead.
//   is_sat      - when true, the result is clamped to SHRT_MAX so it fits in the
//                 signed 16-bit output (the product is never negative, so no lower clamp).
// No broadcasting is handled here beyond what the input windows provide.
template <bool is_scale255, bool is_sat>
void mul_U8_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16 / sizeof(uint8_t);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(
        win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const uint8x16_t bv = wrapper::vloadq(input2_ptr + x);
            const uint8x16_t av = wrapper::vloadq(input1_ptr + x);

            // Widen each u8 half to u16 and multiply: the product cannot overflow u16
            uint16x8_t tmp_low  = vmovl_u8(vget_low_u8(av));
            uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
            tmp_low             = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
            tmp_high            = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));

            if(is_scale255)
            {
                tmp_low  = scale255_U16_U16(tmp_low);
                tmp_high = scale255_U16_U16(tmp_high);
            }
            else
            {
                // Left shift by a negative amount == right shift by n
                const int16x8_t vn = vdupq_n_s16(-n);

                if(is_sat)
                {
                    tmp_low  = vqshlq_u16(tmp_low, vn);
                    tmp_high = vqshlq_u16(tmp_high, vn);
                }
                else
                {
                    tmp_low  = vshlq_u16(tmp_low, vn);
                    tmp_high = vshlq_u16(tmp_high, vn);
                }
            }

            if(is_sat)
            {
                // Clamp to SHRT_MAX so the u16 value reinterprets as a non-negative s16
                static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);

                tmp_low  = vminq_u16(tmp_low, max);
                tmp_high = vminq_u16(tmp_high, max);
            }

            vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low));
            vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high));
        }

        // Compute left-over elements (scalar mirror of the vector path above)
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                // Round-to-nearest scaling; tmp is always non-negative here
                float tmp_f = static_cast<float>(tmp) * scale255_constant;
                tmp         = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                tmp >>= n;
            }

            if(is_sat)
            {
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp;
            }

            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, dst);
}
1654
// Element-wise multiplication S16 * U8 -> S16.
// The U8 input is widened to S16 (via u16) and the per-vector arithmetic is delegated
// to mul_S16_S16_S16_n_k (templated helper defined earlier in this file — handles the
// is_scale255 / is_sat combinations for a pair of int16x8x2_t operands).
// Template parameters:
//   is_scale255 - scale the product by ~1/255 with rounding instead of shifting
//   is_sat      - saturate the result to [SHRT_MIN, SHRT_MAX]
template <bool is_scale255, bool is_sat>
void mul_S16_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(
        win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const int16x8x2_t ta1 =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const uint8x8x2_t ta2u =
            {
                {
                    vld1_u8(input2_ptr + x),
                    vld1_u8(input2_ptr + x + 8),
                }
            };
            // Widen u8 -> u16, then reinterpret as s16 (values <= 255, so no sign issue)
            const int16x8x2_t ta2 =
            {
                {
                    vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])),
                    vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))
                }
            };

            const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);

            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements (scalar mirror of the vector path above)
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                // Round-to-nearest scaling by ~1/255
                float tmp_f = static_cast<float>(tmp) * scale255_constant;

                tmp = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
                    // For negative values, bias by (2^n - 1) before the arithmetic shift
                    // so the division rounds towards zero instead of towards -infinity
                    uint32_t mask = (1u << n) - 1;
                    tmp           = (tmp + static_cast<int32_t>(mask)) >> n;
                }
            }
            if(is_sat)
            {
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
            }
            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, dst);
}
1747
// Element-wise multiplication U8 * S16 -> S16.
// Multiplication is commutative, so simply swap the two input buffers and reuse
// the S16 * U8 implementation above; template parameters are forwarded unchanged.
template <bool is_scale255, bool is_sat>
void mul_U8_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Simply swap the two input buffers
    mul_S16_U8_S16<is_scale255, is_sat>(src2, src1, out, window, n);
}
1754} // namespace
1755
// Configure the multiplication kernel:
//  - validate the src1/src2/dst combination,
//  - auto-initialize the destination shape from the broadcast of the input shapes,
//  - turn the scale into either the "scale255" fast path or a power-of-two exponent,
//  - select the per-data-type kernel function (_func_quantized / _func_int / _func_float),
//  - compute the execution window and the dimension the scheduler should split on.
// rounding_policy is only used by validation (hence ARM_COMPUTE_UNUSED below).
void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
    ARM_COMPUTE_UNUSED(rounding_policy);
    ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);

    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy));

    const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());

    // Auto initialize dst if not initialized
    set_shape_if_empty(*dst, out_shape);

    _scale          = scale;
    _scale_exponent = 0;
    _func_quantized = nullptr;
    _func_int       = nullptr;
    _func_float     = nullptr;

    bool is_scale_255 = false;
    // Check and validate scaling factor:
    // a scale (approximately) equal to scale255_constant selects the dedicated
    // rounding "scale255" kernel variants; any other scale is assumed to be a
    // power of two 1/2^n and is stored as a positive shift exponent.
    if(std::abs(scale - scale255_constant) < 0.00001f)
    {
        is_scale_255 = true;
    }
    else
    {
        int exponent = 0;

        std::frexp(scale, &exponent);

        // Store the positive exponent. We know that we compute 1/2^n
        // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5
        _scale_exponent = std::abs(exponent - 1);
    }

    const DataType dt_input1 = src1->data_type();
    const DataType dt_input2 = src2->data_type();
    const DataType dt_output = dst->data_type();
    const bool     is_sat    = (overflow_policy == ConvertPolicy::SATURATE);

    // Select the kernel implementation from the (input1, input2, output) type triple.
    // Combinations not handled here leave all function pointers null (rejected earlier
    // by validate_arguments) or hit the default error.
    switch(dt_input1)
    {
        case DataType::QASYMM8:
            if(dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8)
            {
                // Prefer the fixed-point path when the scales/offsets allow it
                if(mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
                {
                    _func_quantized = &mul_q8_neon_fixedpoint<uint8_t>;
                }
                else
                {
                    _func_quantized = &mul_saturate_quantized_8<uint8_t>;
                }
            }
            break;
        case DataType::QASYMM8_SIGNED:
            // NOTE(review): unlike the QASYMM8 case above, dt_output is not checked here;
            // presumably validate_arguments() already rejected mismatching outputs — confirm.
            if(dt_input2 == DataType::QASYMM8_SIGNED)
            {
                if(mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
                {
                    _func_quantized = &mul_q8_neon_fixedpoint<int8_t>;
                }
                else
                {
                    _func_quantized = &mul_saturate_quantized_8<int8_t>;
                }
            }
            break;
        case DataType::QSYMM16:
            if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16)
            {
                _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16;
            }
            else if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32)
            {
                _func_int = &mul_QSYMM16_QSYMM16_S32;
            }
            break;
        case DataType::S16:
            if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_S16_U8_S16<true, true> : &mul_S16_U8_S16<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_S16_U8_S16<false, true> : &mul_S16_U8_S16<false, false>;
                }
            }
            if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_S16_S16_S16<true, true> : &mul_S16_S16_S16<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_S16_S16_S16<false, true> : &mul_S16_S16_S16<false, false>;
                }
            }
            break;
        case DataType::S32:
            if(DataType::S32 == dt_input2 && DataType::S32 == dt_output)
            {
                _func_int = is_sat ? &mul_S32_S32_S32<true> : &mul_S32_S32_S32<false>;
            }
            break;
        case DataType::U8:
            if(DataType::U8 == dt_input2 && DataType::U8 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_U8_U8_U8<true, true> : &mul_U8_U8_U8<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_U8_U8_U8<false, true> : &mul_U8_U8_U8<false, false>;
                }
            }
            else if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_U8_U8_S16<true, true> : &mul_U8_U8_S16<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_U8_U8_S16<false, true> : &mul_U8_U8_S16<false, false>;
                }
            }
            else if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_U8_S16_S16<true, true> : &mul_U8_S16_S16<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_U8_S16_S16<false, true> : &mul_U8_S16_S16<false, false>;
                }
            }
            break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        case DataType::F16:
            _func_float = &mul_F16_F16_F16;
            break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
        case DataType::F32:
            _func_float = &mul_F32_F32_F32;
            break;
        default:
            ARM_COMPUTE_ERROR("You called with the wrong img formats");
    }

    // Configure kernel window; also record which dimension the scheduler should split
    // (used by get_mws() to avoid over-splitting tensors squashed to 1D)
    Window win;
    std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*src1, *src2);

    ICpuKernel::configure(win);
}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001917
// Return the minimum workload size (mws) the scheduler should hand to one thread.
// Tuned values exist only for the FP32 path (and only when ENABLE_FP32_KERNELS is
// defined) on Neoverse N1/V1 cores; every other configuration falls through to the
// generic defaults at the bottom. thread_count is currently unused.
size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
{
    ARM_COMPUTE_UNUSED(thread_count);

#if defined(ENABLE_FP32_KERNELS)
    if(this->_func_float == &mul_F32_F32_F32)
    {
        size_t mws = ICPPKernel::default_mws;
        if(platform.get_cpu_model() == CPUModel::N1)
        {
            mws = default_mws_N1_fp32_neon;
        }
        else if(platform.get_cpu_model() == CPUModel::V1)
        {
            mws = default_mws_V1_fp32_neon;
        }
        else
        {
            // Untuned CPU model: return early with the generic defaults
            // (the scaling logic below only applies to the tuned N1/V1 values)
            if(_split_dimension == Window::DimX)
            {
                // Don't split the work load too small if the tensor has been reinterpreted as 1D.
                // This number is loosely chosen as threading overhead in each platform varies wildly.
                return default_mws_other_platforms_1d_tensor;
            }
            return default_mws;
        }

        // tensor is 1D or was re-interpreted as 1D
        if(this->window().shape().num_dimensions() == 1)
        {
            return mws;
        }
        else
        {
            // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one
            // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small
            // but the other sizes are large, which boosts performance.
            mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1)));
            return std::max(static_cast<size_t>(1), mws);
        }
    }
#else /* ENABLE_FP32_KERNELS */
    ARM_COMPUTE_UNUSED(platform);
#endif /* ENABLE_FP32_KERNELS */
    // Generic fallback (non-FP32 kernels, or FP32 kernels disabled at build time)
    if(_split_dimension == Window::DimX)
    {
        // Don't split the work load too small if the tensor has been reinterpreted as 1D.
        // This number is loosely chosen as threading overhead in each platform varies wildly.
        return default_mws_other_platforms_1d_tensor;
    }
    return default_mws;
}
1970
// Static validation entry point: checks whether the given tensor-info/scale/policy
// combination is supported, without configuring a kernel. Returns an empty Status
// on success; ARM_COMPUTE_RETURN_ON_ERROR propagates the failure status otherwise.
Status CpuMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy,
                              RoundingPolicy rounding_policy)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy));

    return Status{};
}
1979
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01001980void CpuMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001981{
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001982 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001983 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001984 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001985
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001986 auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
1987 auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
1988 auto dst = tensors.get_tensor(TensorType::ACL_DST);
Michalis Spyrou6eb73452020-07-02 17:39:25 +01001989
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001990 if(_func_quantized != nullptr)
Michalis Spyrou861f0db2018-02-26 16:47:58 +00001991 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001992 (*_func_quantized)(src1, src2, dst, window, _scale);
Manuel Bottini79fa9a22019-02-22 17:54:22 +00001993 }
1994 else if(_func_int != nullptr)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001995 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001996 (*_func_int)(src1, src2, dst, window, _scale_exponent);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001997 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001998 else
1999 {
2000 ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002001 (*_func_float)(src1, src2, dst, window, _scale);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01002002 }
2003}
Viet-Hoa Dod4a9cc02022-11-08 12:01:21 +00002004
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01002005const char *CpuMulKernel::name() const
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002006{
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01002007 return "CpuMulKernel";
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002008}
Viet-Hoa Dod4a9cc02022-11-08 12:01:21 +00002009
giuros01154bc1c2019-03-26 17:44:40 +00002010namespace
2011{
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002012Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
giuros01154bc1c2019-03-26 17:44:40 +00002013{
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002014 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F32);
2015 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F32);
giuros01154bc1c2019-03-26 17:44:40 +00002016
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002017 const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
giuros01154bc1c2019-03-26 17:44:40 +00002018
2019 ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
2020
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002021 // Validate in case of configured dst
2022 if(dst->total_size() > 0)
giuros01154bc1c2019-03-26 17:44:40 +00002023 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002024 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F32);
2025 ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
giuros01154bc1c2019-03-26 17:44:40 +00002026 }
2027
2028 return Status{};
2029}
Sheri Zhang4d91dc62020-09-23 11:22:50 +01002030} // namespace
giuros01154bc1c2019-03-26 17:44:40 +00002031
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01002032void CpuComplexMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
giuros01154bc1c2019-03-26 17:44:40 +00002033{
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002034 ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
2035 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst));
Sheri Zhang4d91dc62020-09-23 11:22:50 +01002036
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002037 const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
giuros01154bc1c2019-03-26 17:44:40 +00002038
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002039 // Auto initialize dst if not initialized
2040 const TensorInfo out_info(out_shape, src1->num_channels(), src1->data_type());
2041 auto_init_if_empty(*dst, out_info);
giuros01154bc1c2019-03-26 17:44:40 +00002042
giuros01154bc1c2019-03-26 17:44:40 +00002043 // Configure kernel window
SiCongLic7b1e842021-02-22 14:28:33 +00002044 Window win = calculate_max_window(out_shape);
giuros01154bc1c2019-03-26 17:44:40 +00002045
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002046 ICpuKernel::configure(win);
giuros01154bc1c2019-03-26 17:44:40 +00002047}
2048
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01002049Status CpuComplexMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
giuros01154bc1c2019-03-26 17:44:40 +00002050{
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002051 ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
2052 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst));
giuros01154bc1c2019-03-26 17:44:40 +00002053
2054 return Status{};
2055}
2056
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01002057void CpuComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
giuros01154bc1c2019-03-26 17:44:40 +00002058{
2059 ARM_COMPUTE_UNUSED(info);
2060 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002061 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
giuros01154bc1c2019-03-26 17:44:40 +00002062
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002063 auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
2064 auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
2065 auto dst = tensors.get_tensor(TensorType::ACL_DST);
Michalis Spyrou6eb73452020-07-02 17:39:25 +01002066
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002067 c_mul_F32_F32_F32_n(src1, src2, dst, window);
giuros01154bc1c2019-03-26 17:44:40 +00002068}
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002069
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01002070const char *CpuComplexMulKernel::name() const
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002071{
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01002072 return "CpuComplexMulKernel";
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002073}
2074} // namespace kernels
2075} // namespace cpu
Manuel Bottini79fa9a22019-02-22 17:54:22 +00002076} // namespace arm_compute